xref: /openbmc/linux/mm/hugetlb.c (revision 31caf665)
11da177e4SLinus Torvalds /*
21da177e4SLinus Torvalds  * Generic hugetlb support.
36d49e352SNadia Yvette Chambers  * (C) Nadia Yvette Chambers, April 2004
41da177e4SLinus Torvalds  */
51da177e4SLinus Torvalds #include <linux/list.h>
61da177e4SLinus Torvalds #include <linux/init.h>
71da177e4SLinus Torvalds #include <linux/module.h>
81da177e4SLinus Torvalds #include <linux/mm.h>
9e1759c21SAlexey Dobriyan #include <linux/seq_file.h>
101da177e4SLinus Torvalds #include <linux/sysctl.h>
111da177e4SLinus Torvalds #include <linux/highmem.h>
12cddb8a5cSAndrea Arcangeli #include <linux/mmu_notifier.h>
131da177e4SLinus Torvalds #include <linux/nodemask.h>
1463551ae0SDavid Gibson #include <linux/pagemap.h>
155da7ca86SChristoph Lameter #include <linux/mempolicy.h>
16aea47ff3SChristoph Lameter #include <linux/cpuset.h>
173935baa9SDavid Gibson #include <linux/mutex.h>
18aa888a74SAndi Kleen #include <linux/bootmem.h>
19a3437870SNishanth Aravamudan #include <linux/sysfs.h>
205a0e3ad6STejun Heo #include <linux/slab.h>
210fe6e20bSNaoya Horiguchi #include <linux/rmap.h>
22fd6a03edSNaoya Horiguchi #include <linux/swap.h>
23fd6a03edSNaoya Horiguchi #include <linux/swapops.h>
24d6606683SLinus Torvalds 
2563551ae0SDavid Gibson #include <asm/page.h>
2663551ae0SDavid Gibson #include <asm/pgtable.h>
2724669e58SAneesh Kumar K.V #include <asm/tlb.h>
2863551ae0SDavid Gibson 
2924669e58SAneesh Kumar K.V #include <linux/io.h>
3063551ae0SDavid Gibson #include <linux/hugetlb.h>
319dd540e2SAneesh Kumar K.V #include <linux/hugetlb_cgroup.h>
329a305230SLee Schermerhorn #include <linux/node.h>
337835e98bSNick Piggin #include "internal.h"
341da177e4SLinus Torvalds 
351da177e4SLinus Torvalds const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
36396faf03SMel Gorman static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
37396faf03SMel Gorman unsigned long hugepages_treat_as_movable;
38a5516438SAndi Kleen 
39c3f38a38SAneesh Kumar K.V int hugetlb_max_hstate __read_mostly;
40e5ff2159SAndi Kleen unsigned int default_hstate_idx;
41e5ff2159SAndi Kleen struct hstate hstates[HUGE_MAX_HSTATE];
42e5ff2159SAndi Kleen 
4353ba51d2SJon Tollefson __initdata LIST_HEAD(huge_boot_pages);
4453ba51d2SJon Tollefson 
45e5ff2159SAndi Kleen /* for command line parsing */
46e5ff2159SAndi Kleen static struct hstate * __initdata parsed_hstate;
47e5ff2159SAndi Kleen static unsigned long __initdata default_hstate_max_huge_pages;
48e11bfbfcSNick Piggin static unsigned long __initdata default_hstate_size;
49e5ff2159SAndi Kleen 
503935baa9SDavid Gibson /*
5131caf665SNaoya Horiguchi  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
5231caf665SNaoya Horiguchi  * free_huge_pages, and surplus_huge_pages.
533935baa9SDavid Gibson  */
54c3f38a38SAneesh Kumar K.V DEFINE_SPINLOCK(hugetlb_lock);
550bd0f9fbSEric Paris 
5690481622SDavid Gibson static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
5790481622SDavid Gibson {
5890481622SDavid Gibson 	bool free = (spool->count == 0) && (spool->used_hpages == 0);
5990481622SDavid Gibson 
6090481622SDavid Gibson 	spin_unlock(&spool->lock);
6190481622SDavid Gibson 
6290481622SDavid Gibson 	/* If no pages are used, and no other handles to the subpool
6390481622SDavid Gibson 	 * remain, free the subpool the subpool remain */
6490481622SDavid Gibson 	if (free)
6590481622SDavid Gibson 		kfree(spool);
6690481622SDavid Gibson }
6790481622SDavid Gibson 
6890481622SDavid Gibson struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
6990481622SDavid Gibson {
7090481622SDavid Gibson 	struct hugepage_subpool *spool;
7190481622SDavid Gibson 
7290481622SDavid Gibson 	spool = kmalloc(sizeof(*spool), GFP_KERNEL);
7390481622SDavid Gibson 	if (!spool)
7490481622SDavid Gibson 		return NULL;
7590481622SDavid Gibson 
7690481622SDavid Gibson 	spin_lock_init(&spool->lock);
7790481622SDavid Gibson 	spool->count = 1;
7890481622SDavid Gibson 	spool->max_hpages = nr_blocks;
7990481622SDavid Gibson 	spool->used_hpages = 0;
8090481622SDavid Gibson 
8190481622SDavid Gibson 	return spool;
8290481622SDavid Gibson }
8390481622SDavid Gibson 
8490481622SDavid Gibson void hugepage_put_subpool(struct hugepage_subpool *spool)
8590481622SDavid Gibson {
8690481622SDavid Gibson 	spin_lock(&spool->lock);
8790481622SDavid Gibson 	BUG_ON(!spool->count);
8890481622SDavid Gibson 	spool->count--;
8990481622SDavid Gibson 	unlock_or_release_subpool(spool);
9090481622SDavid Gibson }
9190481622SDavid Gibson 
9290481622SDavid Gibson static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
9390481622SDavid Gibson 				      long delta)
9490481622SDavid Gibson {
9590481622SDavid Gibson 	int ret = 0;
9690481622SDavid Gibson 
9790481622SDavid Gibson 	if (!spool)
9890481622SDavid Gibson 		return 0;
9990481622SDavid Gibson 
10090481622SDavid Gibson 	spin_lock(&spool->lock);
10190481622SDavid Gibson 	if ((spool->used_hpages + delta) <= spool->max_hpages) {
10290481622SDavid Gibson 		spool->used_hpages += delta;
10390481622SDavid Gibson 	} else {
10490481622SDavid Gibson 		ret = -ENOMEM;
10590481622SDavid Gibson 	}
10690481622SDavid Gibson 	spin_unlock(&spool->lock);
10790481622SDavid Gibson 
10890481622SDavid Gibson 	return ret;
10990481622SDavid Gibson }
11090481622SDavid Gibson 
11190481622SDavid Gibson static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
11290481622SDavid Gibson 				       long delta)
11390481622SDavid Gibson {
11490481622SDavid Gibson 	if (!spool)
11590481622SDavid Gibson 		return;
11690481622SDavid Gibson 
11790481622SDavid Gibson 	spin_lock(&spool->lock);
11890481622SDavid Gibson 	spool->used_hpages -= delta;
11990481622SDavid Gibson 	/* If hugetlbfs_put_super couldn't free spool due to
12090481622SDavid Gibson 	* an outstanding quota reference, free it now. */
12190481622SDavid Gibson 	unlock_or_release_subpool(spool);
12290481622SDavid Gibson }
12390481622SDavid Gibson 
12490481622SDavid Gibson static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
12590481622SDavid Gibson {
12690481622SDavid Gibson 	return HUGETLBFS_SB(inode->i_sb)->spool;
12790481622SDavid Gibson }
12890481622SDavid Gibson 
12990481622SDavid Gibson static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
13090481622SDavid Gibson {
131496ad9aaSAl Viro 	return subpool_inode(file_inode(vma->vm_file));
13290481622SDavid Gibson }
13390481622SDavid Gibson 
134e7c4b0bfSAndy Whitcroft /*
13596822904SAndy Whitcroft  * Region tracking -- allows tracking of reservations and instantiated pages
13696822904SAndy Whitcroft  *                    across the pages in a mapping.
13784afd99bSAndy Whitcroft  *
13884afd99bSAndy Whitcroft  * The region data structures are protected by a combination of the mmap_sem
139c748c262SJoonsoo Kim  * and the hugetlb_instantiation_mutex.  To access or modify a region the caller
14084afd99bSAndy Whitcroft  * must either hold the mmap_sem for write, or the mmap_sem for read and
141c748c262SJoonsoo Kim  * the hugetlb_instantiation_mutex:
14284afd99bSAndy Whitcroft  *
14384afd99bSAndy Whitcroft  *	down_write(&mm->mmap_sem);
14484afd99bSAndy Whitcroft  * or
14584afd99bSAndy Whitcroft  *	down_read(&mm->mmap_sem);
14684afd99bSAndy Whitcroft  *	mutex_lock(&hugetlb_instantiation_mutex);
14796822904SAndy Whitcroft  */
14896822904SAndy Whitcroft struct file_region {
14996822904SAndy Whitcroft 	struct list_head link;
15096822904SAndy Whitcroft 	long from;
15196822904SAndy Whitcroft 	long to;
15296822904SAndy Whitcroft };
15396822904SAndy Whitcroft 
15496822904SAndy Whitcroft static long region_add(struct list_head *head, long f, long t)
15596822904SAndy Whitcroft {
15696822904SAndy Whitcroft 	struct file_region *rg, *nrg, *trg;
15796822904SAndy Whitcroft 
15896822904SAndy Whitcroft 	/* Locate the region we are either in or before. */
15996822904SAndy Whitcroft 	list_for_each_entry(rg, head, link)
16096822904SAndy Whitcroft 		if (f <= rg->to)
16196822904SAndy Whitcroft 			break;
16296822904SAndy Whitcroft 
16396822904SAndy Whitcroft 	/* Round our left edge to the current segment if it encloses us. */
16496822904SAndy Whitcroft 	if (f > rg->from)
16596822904SAndy Whitcroft 		f = rg->from;
16696822904SAndy Whitcroft 
16796822904SAndy Whitcroft 	/* Check for and consume any regions we now overlap with. */
16896822904SAndy Whitcroft 	nrg = rg;
16996822904SAndy Whitcroft 	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
17096822904SAndy Whitcroft 		if (&rg->link == head)
17196822904SAndy Whitcroft 			break;
17296822904SAndy Whitcroft 		if (rg->from > t)
17396822904SAndy Whitcroft 			break;
17496822904SAndy Whitcroft 
17596822904SAndy Whitcroft 		/* If this area reaches higher then extend our area to
17696822904SAndy Whitcroft 		 * include it completely.  If this is not the first area
17796822904SAndy Whitcroft 		 * which we intend to reuse, free it. */
17896822904SAndy Whitcroft 		if (rg->to > t)
17996822904SAndy Whitcroft 			t = rg->to;
18096822904SAndy Whitcroft 		if (rg != nrg) {
18196822904SAndy Whitcroft 			list_del(&rg->link);
18296822904SAndy Whitcroft 			kfree(rg);
18396822904SAndy Whitcroft 		}
18496822904SAndy Whitcroft 	}
18596822904SAndy Whitcroft 	nrg->from = f;
18696822904SAndy Whitcroft 	nrg->to = t;
18796822904SAndy Whitcroft 	return 0;
18896822904SAndy Whitcroft }
18996822904SAndy Whitcroft 
19096822904SAndy Whitcroft static long region_chg(struct list_head *head, long f, long t)
19196822904SAndy Whitcroft {
19296822904SAndy Whitcroft 	struct file_region *rg, *nrg;
19396822904SAndy Whitcroft 	long chg = 0;
19496822904SAndy Whitcroft 
19596822904SAndy Whitcroft 	/* Locate the region we are before or in. */
19696822904SAndy Whitcroft 	list_for_each_entry(rg, head, link)
19796822904SAndy Whitcroft 		if (f <= rg->to)
19896822904SAndy Whitcroft 			break;
19996822904SAndy Whitcroft 
20096822904SAndy Whitcroft 	/* If we are below the current region then a new region is required.
20196822904SAndy Whitcroft 	 * Subtle, allocate a new region at the position but make it zero
20296822904SAndy Whitcroft 	 * size such that we can guarantee to record the reservation. */
20396822904SAndy Whitcroft 	if (&rg->link == head || t < rg->from) {
20496822904SAndy Whitcroft 		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
20596822904SAndy Whitcroft 		if (!nrg)
20696822904SAndy Whitcroft 			return -ENOMEM;
20796822904SAndy Whitcroft 		nrg->from = f;
20896822904SAndy Whitcroft 		nrg->to   = f;
20996822904SAndy Whitcroft 		INIT_LIST_HEAD(&nrg->link);
21096822904SAndy Whitcroft 		list_add(&nrg->link, rg->link.prev);
21196822904SAndy Whitcroft 
21296822904SAndy Whitcroft 		return t - f;
21396822904SAndy Whitcroft 	}
21496822904SAndy Whitcroft 
21596822904SAndy Whitcroft 	/* Round our left edge to the current segment if it encloses us. */
21696822904SAndy Whitcroft 	if (f > rg->from)
21796822904SAndy Whitcroft 		f = rg->from;
21896822904SAndy Whitcroft 	chg = t - f;
21996822904SAndy Whitcroft 
22096822904SAndy Whitcroft 	/* Check for and consume any regions we now overlap with. */
22196822904SAndy Whitcroft 	list_for_each_entry(rg, rg->link.prev, link) {
22296822904SAndy Whitcroft 		if (&rg->link == head)
22396822904SAndy Whitcroft 			break;
22496822904SAndy Whitcroft 		if (rg->from > t)
22596822904SAndy Whitcroft 			return chg;
22696822904SAndy Whitcroft 
22725985edcSLucas De Marchi 		/* We overlap with this area, if it extends further than
22896822904SAndy Whitcroft 		 * us then we must extend ourselves.  Account for its
22996822904SAndy Whitcroft 		 * existing reservation. */
23096822904SAndy Whitcroft 		if (rg->to > t) {
23196822904SAndy Whitcroft 			chg += rg->to - t;
23296822904SAndy Whitcroft 			t = rg->to;
23396822904SAndy Whitcroft 		}
23496822904SAndy Whitcroft 		chg -= rg->to - rg->from;
23596822904SAndy Whitcroft 	}
23696822904SAndy Whitcroft 	return chg;
23796822904SAndy Whitcroft }
23896822904SAndy Whitcroft 
23996822904SAndy Whitcroft static long region_truncate(struct list_head *head, long end)
24096822904SAndy Whitcroft {
24196822904SAndy Whitcroft 	struct file_region *rg, *trg;
24296822904SAndy Whitcroft 	long chg = 0;
24396822904SAndy Whitcroft 
24496822904SAndy Whitcroft 	/* Locate the region we are either in or before. */
24596822904SAndy Whitcroft 	list_for_each_entry(rg, head, link)
24696822904SAndy Whitcroft 		if (end <= rg->to)
24796822904SAndy Whitcroft 			break;
24896822904SAndy Whitcroft 	if (&rg->link == head)
24996822904SAndy Whitcroft 		return 0;
25096822904SAndy Whitcroft 
25196822904SAndy Whitcroft 	/* If we are in the middle of a region then adjust it. */
25296822904SAndy Whitcroft 	if (end > rg->from) {
25396822904SAndy Whitcroft 		chg = rg->to - end;
25496822904SAndy Whitcroft 		rg->to = end;
25596822904SAndy Whitcroft 		rg = list_entry(rg->link.next, typeof(*rg), link);
25696822904SAndy Whitcroft 	}
25796822904SAndy Whitcroft 
25896822904SAndy Whitcroft 	/* Drop any remaining regions. */
25996822904SAndy Whitcroft 	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
26096822904SAndy Whitcroft 		if (&rg->link == head)
26196822904SAndy Whitcroft 			break;
26296822904SAndy Whitcroft 		chg += rg->to - rg->from;
26396822904SAndy Whitcroft 		list_del(&rg->link);
26496822904SAndy Whitcroft 		kfree(rg);
26596822904SAndy Whitcroft 	}
26696822904SAndy Whitcroft 	return chg;
26796822904SAndy Whitcroft }
26896822904SAndy Whitcroft 
26984afd99bSAndy Whitcroft static long region_count(struct list_head *head, long f, long t)
27084afd99bSAndy Whitcroft {
27184afd99bSAndy Whitcroft 	struct file_region *rg;
27284afd99bSAndy Whitcroft 	long chg = 0;
27384afd99bSAndy Whitcroft 
27484afd99bSAndy Whitcroft 	/* Locate each segment we overlap with, and count that overlap. */
27584afd99bSAndy Whitcroft 	list_for_each_entry(rg, head, link) {
276f2135a4aSWang Sheng-Hui 		long seg_from;
277f2135a4aSWang Sheng-Hui 		long seg_to;
27884afd99bSAndy Whitcroft 
27984afd99bSAndy Whitcroft 		if (rg->to <= f)
28084afd99bSAndy Whitcroft 			continue;
28184afd99bSAndy Whitcroft 		if (rg->from >= t)
28284afd99bSAndy Whitcroft 			break;
28384afd99bSAndy Whitcroft 
28484afd99bSAndy Whitcroft 		seg_from = max(rg->from, f);
28584afd99bSAndy Whitcroft 		seg_to = min(rg->to, t);
28684afd99bSAndy Whitcroft 
28784afd99bSAndy Whitcroft 		chg += seg_to - seg_from;
28884afd99bSAndy Whitcroft 	}
28984afd99bSAndy Whitcroft 
29084afd99bSAndy Whitcroft 	return chg;
29184afd99bSAndy Whitcroft }
29284afd99bSAndy Whitcroft 
29396822904SAndy Whitcroft /*
294e7c4b0bfSAndy Whitcroft  * Convert the address within this vma to the page offset within
295e7c4b0bfSAndy Whitcroft  * the mapping, in pagecache page units; huge pages here.
296e7c4b0bfSAndy Whitcroft  */
297a5516438SAndi Kleen static pgoff_t vma_hugecache_offset(struct hstate *h,
298a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long address)
299e7c4b0bfSAndy Whitcroft {
300a5516438SAndi Kleen 	return ((address - vma->vm_start) >> huge_page_shift(h)) +
301a5516438SAndi Kleen 			(vma->vm_pgoff >> huge_page_order(h));
302e7c4b0bfSAndy Whitcroft }
303e7c4b0bfSAndy Whitcroft 
3040fe6e20bSNaoya Horiguchi pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
3050fe6e20bSNaoya Horiguchi 				     unsigned long address)
3060fe6e20bSNaoya Horiguchi {
3070fe6e20bSNaoya Horiguchi 	return vma_hugecache_offset(hstate_vma(vma), vma, address);
3080fe6e20bSNaoya Horiguchi }
3090fe6e20bSNaoya Horiguchi 
31084afd99bSAndy Whitcroft /*
31108fba699SMel Gorman  * Return the size of the pages allocated when backing a VMA. In the majority
31208fba699SMel Gorman  * cases this will be same size as used by the page table entries.
31308fba699SMel Gorman  */
31408fba699SMel Gorman unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
31508fba699SMel Gorman {
31608fba699SMel Gorman 	struct hstate *hstate;
31708fba699SMel Gorman 
31808fba699SMel Gorman 	if (!is_vm_hugetlb_page(vma))
31908fba699SMel Gorman 		return PAGE_SIZE;
32008fba699SMel Gorman 
32108fba699SMel Gorman 	hstate = hstate_vma(vma);
32208fba699SMel Gorman 
3232415cf12SWanpeng Li 	return 1UL << huge_page_shift(hstate);
32408fba699SMel Gorman }
325f340ca0fSJoerg Roedel EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
32608fba699SMel Gorman 
32708fba699SMel Gorman /*
3283340289dSMel Gorman  * Return the page size being used by the MMU to back a VMA. In the majority
3293340289dSMel Gorman  * of cases, the page size used by the kernel matches the MMU size. On
3303340289dSMel Gorman  * architectures where it differs, an architecture-specific version of this
3313340289dSMel Gorman  * function is required.
3323340289dSMel Gorman  */
3333340289dSMel Gorman #ifndef vma_mmu_pagesize
3343340289dSMel Gorman unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
3353340289dSMel Gorman {
3363340289dSMel Gorman 	return vma_kernel_pagesize(vma);
3373340289dSMel Gorman }
3383340289dSMel Gorman #endif
3393340289dSMel Gorman 
3403340289dSMel Gorman /*
34184afd99bSAndy Whitcroft  * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
34284afd99bSAndy Whitcroft  * bits of the reservation map pointer, which are always clear due to
34384afd99bSAndy Whitcroft  * alignment.
34484afd99bSAndy Whitcroft  */
34584afd99bSAndy Whitcroft #define HPAGE_RESV_OWNER    (1UL << 0)
34684afd99bSAndy Whitcroft #define HPAGE_RESV_UNMAPPED (1UL << 1)
34704f2cbe3SMel Gorman #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
34884afd99bSAndy Whitcroft 
349a1e78772SMel Gorman /*
350a1e78772SMel Gorman  * These helpers are used to track how many pages are reserved for
351a1e78772SMel Gorman  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
352a1e78772SMel Gorman  * is guaranteed to have their future faults succeed.
353a1e78772SMel Gorman  *
354a1e78772SMel Gorman  * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
355a1e78772SMel Gorman  * the reserve counters are updated with the hugetlb_lock held. It is safe
356a1e78772SMel Gorman  * to reset the VMA at fork() time as it is not in use yet and there is no
357a1e78772SMel Gorman  * chance of the global counters getting corrupted as a result of the values.
35884afd99bSAndy Whitcroft  *
35984afd99bSAndy Whitcroft  * The private mapping reservation is represented in a subtly different
36084afd99bSAndy Whitcroft  * manner to a shared mapping.  A shared mapping has a region map associated
36184afd99bSAndy Whitcroft  * with the underlying file, this region map represents the backing file
36284afd99bSAndy Whitcroft  * pages which have ever had a reservation assigned which this persists even
36384afd99bSAndy Whitcroft  * after the page is instantiated.  A private mapping has a region map
36484afd99bSAndy Whitcroft  * associated with the original mmap which is attached to all VMAs which
36584afd99bSAndy Whitcroft  * reference it, this region map represents those offsets which have consumed
36684afd99bSAndy Whitcroft  * reservation ie. where pages have been instantiated.
367a1e78772SMel Gorman  */
368e7c4b0bfSAndy Whitcroft static unsigned long get_vma_private_data(struct vm_area_struct *vma)
369e7c4b0bfSAndy Whitcroft {
370e7c4b0bfSAndy Whitcroft 	return (unsigned long)vma->vm_private_data;
371e7c4b0bfSAndy Whitcroft }
372e7c4b0bfSAndy Whitcroft 
373e7c4b0bfSAndy Whitcroft static void set_vma_private_data(struct vm_area_struct *vma,
374e7c4b0bfSAndy Whitcroft 							unsigned long value)
375e7c4b0bfSAndy Whitcroft {
376e7c4b0bfSAndy Whitcroft 	vma->vm_private_data = (void *)value;
377e7c4b0bfSAndy Whitcroft }
378e7c4b0bfSAndy Whitcroft 
37984afd99bSAndy Whitcroft struct resv_map {
38084afd99bSAndy Whitcroft 	struct kref refs;
38184afd99bSAndy Whitcroft 	struct list_head regions;
38284afd99bSAndy Whitcroft };
38384afd99bSAndy Whitcroft 
3842a4b3dedSHarvey Harrison static struct resv_map *resv_map_alloc(void)
38584afd99bSAndy Whitcroft {
38684afd99bSAndy Whitcroft 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
38784afd99bSAndy Whitcroft 	if (!resv_map)
38884afd99bSAndy Whitcroft 		return NULL;
38984afd99bSAndy Whitcroft 
39084afd99bSAndy Whitcroft 	kref_init(&resv_map->refs);
39184afd99bSAndy Whitcroft 	INIT_LIST_HEAD(&resv_map->regions);
39284afd99bSAndy Whitcroft 
39384afd99bSAndy Whitcroft 	return resv_map;
39484afd99bSAndy Whitcroft }
39584afd99bSAndy Whitcroft 
3962a4b3dedSHarvey Harrison static void resv_map_release(struct kref *ref)
39784afd99bSAndy Whitcroft {
39884afd99bSAndy Whitcroft 	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
39984afd99bSAndy Whitcroft 
40084afd99bSAndy Whitcroft 	/* Clear out any active regions before we release the map. */
40184afd99bSAndy Whitcroft 	region_truncate(&resv_map->regions, 0);
40284afd99bSAndy Whitcroft 	kfree(resv_map);
40384afd99bSAndy Whitcroft }
40484afd99bSAndy Whitcroft 
40584afd99bSAndy Whitcroft static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
406a1e78772SMel Gorman {
407a1e78772SMel Gorman 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
408f83a275dSMel Gorman 	if (!(vma->vm_flags & VM_MAYSHARE))
40984afd99bSAndy Whitcroft 		return (struct resv_map *)(get_vma_private_data(vma) &
41084afd99bSAndy Whitcroft 							~HPAGE_RESV_MASK);
4112a4b3dedSHarvey Harrison 	return NULL;
412a1e78772SMel Gorman }
413a1e78772SMel Gorman 
41484afd99bSAndy Whitcroft static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
415a1e78772SMel Gorman {
416a1e78772SMel Gorman 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
417f83a275dSMel Gorman 	VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
418a1e78772SMel Gorman 
41984afd99bSAndy Whitcroft 	set_vma_private_data(vma, (get_vma_private_data(vma) &
42084afd99bSAndy Whitcroft 				HPAGE_RESV_MASK) | (unsigned long)map);
42104f2cbe3SMel Gorman }
42204f2cbe3SMel Gorman 
42304f2cbe3SMel Gorman static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
42404f2cbe3SMel Gorman {
42504f2cbe3SMel Gorman 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
426f83a275dSMel Gorman 	VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
427e7c4b0bfSAndy Whitcroft 
428e7c4b0bfSAndy Whitcroft 	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
42904f2cbe3SMel Gorman }
43004f2cbe3SMel Gorman 
43104f2cbe3SMel Gorman static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
43204f2cbe3SMel Gorman {
43304f2cbe3SMel Gorman 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
434e7c4b0bfSAndy Whitcroft 
435e7c4b0bfSAndy Whitcroft 	return (get_vma_private_data(vma) & flag) != 0;
436a1e78772SMel Gorman }
437a1e78772SMel Gorman 
43804f2cbe3SMel Gorman /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
439a1e78772SMel Gorman void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
440a1e78772SMel Gorman {
441a1e78772SMel Gorman 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
442f83a275dSMel Gorman 	if (!(vma->vm_flags & VM_MAYSHARE))
443a1e78772SMel Gorman 		vma->vm_private_data = (void *)0;
444a1e78772SMel Gorman }
445a1e78772SMel Gorman 
446a1e78772SMel Gorman /* Returns true if the VMA has associated reserve pages */
447af0ed73eSJoonsoo Kim static int vma_has_reserves(struct vm_area_struct *vma, long chg)
448a1e78772SMel Gorman {
449af0ed73eSJoonsoo Kim 	if (vma->vm_flags & VM_NORESERVE) {
450af0ed73eSJoonsoo Kim 		/*
451af0ed73eSJoonsoo Kim 		 * This address is already reserved by other process(chg == 0),
452af0ed73eSJoonsoo Kim 		 * so, we should decrement reserved count. Without decrementing,
453af0ed73eSJoonsoo Kim 		 * reserve count remains after releasing inode, because this
454af0ed73eSJoonsoo Kim 		 * allocated page will go into page cache and is regarded as
455af0ed73eSJoonsoo Kim 		 * coming from reserved pool in releasing step.  Currently, we
456af0ed73eSJoonsoo Kim 		 * don't have any other solution to deal with this situation
457af0ed73eSJoonsoo Kim 		 * properly, so add work-around here.
458af0ed73eSJoonsoo Kim 		 */
459af0ed73eSJoonsoo Kim 		if (vma->vm_flags & VM_MAYSHARE && chg == 0)
460af0ed73eSJoonsoo Kim 			return 1;
461af0ed73eSJoonsoo Kim 		else
46272231b03SJoonsoo Kim 			return 0;
463af0ed73eSJoonsoo Kim 	}
464a63884e9SJoonsoo Kim 
465a63884e9SJoonsoo Kim 	/* Shared mappings always use reserves */
466f83a275dSMel Gorman 	if (vma->vm_flags & VM_MAYSHARE)
467a1e78772SMel Gorman 		return 1;
468a63884e9SJoonsoo Kim 
469a63884e9SJoonsoo Kim 	/*
470a63884e9SJoonsoo Kim 	 * Only the process that called mmap() has reserves for
471a63884e9SJoonsoo Kim 	 * private mappings.
472a63884e9SJoonsoo Kim 	 */
4737f09ca51SMel Gorman 	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
4747f09ca51SMel Gorman 		return 1;
475a63884e9SJoonsoo Kim 
4767f09ca51SMel Gorman 	return 0;
477a1e78772SMel Gorman }
478a1e78772SMel Gorman 
4790ebabb41SNaoya Horiguchi static void copy_gigantic_page(struct page *dst, struct page *src)
4800ebabb41SNaoya Horiguchi {
4810ebabb41SNaoya Horiguchi 	int i;
4820ebabb41SNaoya Horiguchi 	struct hstate *h = page_hstate(src);
4830ebabb41SNaoya Horiguchi 	struct page *dst_base = dst;
4840ebabb41SNaoya Horiguchi 	struct page *src_base = src;
4850ebabb41SNaoya Horiguchi 
4860ebabb41SNaoya Horiguchi 	for (i = 0; i < pages_per_huge_page(h); ) {
4870ebabb41SNaoya Horiguchi 		cond_resched();
4880ebabb41SNaoya Horiguchi 		copy_highpage(dst, src);
4890ebabb41SNaoya Horiguchi 
4900ebabb41SNaoya Horiguchi 		i++;
4910ebabb41SNaoya Horiguchi 		dst = mem_map_next(dst, dst_base, i);
4920ebabb41SNaoya Horiguchi 		src = mem_map_next(src, src_base, i);
4930ebabb41SNaoya Horiguchi 	}
4940ebabb41SNaoya Horiguchi }
4950ebabb41SNaoya Horiguchi 
4960ebabb41SNaoya Horiguchi void copy_huge_page(struct page *dst, struct page *src)
4970ebabb41SNaoya Horiguchi {
4980ebabb41SNaoya Horiguchi 	int i;
4990ebabb41SNaoya Horiguchi 	struct hstate *h = page_hstate(src);
5000ebabb41SNaoya Horiguchi 
5010ebabb41SNaoya Horiguchi 	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
5020ebabb41SNaoya Horiguchi 		copy_gigantic_page(dst, src);
5030ebabb41SNaoya Horiguchi 		return;
5040ebabb41SNaoya Horiguchi 	}
5050ebabb41SNaoya Horiguchi 
5060ebabb41SNaoya Horiguchi 	might_sleep();
5070ebabb41SNaoya Horiguchi 	for (i = 0; i < pages_per_huge_page(h); i++) {
5080ebabb41SNaoya Horiguchi 		cond_resched();
5090ebabb41SNaoya Horiguchi 		copy_highpage(dst + i, src + i);
5100ebabb41SNaoya Horiguchi 	}
5110ebabb41SNaoya Horiguchi }
5120ebabb41SNaoya Horiguchi 
513a5516438SAndi Kleen static void enqueue_huge_page(struct hstate *h, struct page *page)
5141da177e4SLinus Torvalds {
5151da177e4SLinus Torvalds 	int nid = page_to_nid(page);
5160edaecfaSAneesh Kumar K.V 	list_move(&page->lru, &h->hugepage_freelists[nid]);
517a5516438SAndi Kleen 	h->free_huge_pages++;
518a5516438SAndi Kleen 	h->free_huge_pages_node[nid]++;
5191da177e4SLinus Torvalds }
5201da177e4SLinus Torvalds 
521bf50bab2SNaoya Horiguchi static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
522bf50bab2SNaoya Horiguchi {
523bf50bab2SNaoya Horiguchi 	struct page *page;
524bf50bab2SNaoya Horiguchi 
525bf50bab2SNaoya Horiguchi 	if (list_empty(&h->hugepage_freelists[nid]))
526bf50bab2SNaoya Horiguchi 		return NULL;
527bf50bab2SNaoya Horiguchi 	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
5280edaecfaSAneesh Kumar K.V 	list_move(&page->lru, &h->hugepage_activelist);
529a9869b83SNaoya Horiguchi 	set_page_refcounted(page);
530bf50bab2SNaoya Horiguchi 	h->free_huge_pages--;
531bf50bab2SNaoya Horiguchi 	h->free_huge_pages_node[nid]--;
532bf50bab2SNaoya Horiguchi 	return page;
533bf50bab2SNaoya Horiguchi }
534bf50bab2SNaoya Horiguchi 
535a5516438SAndi Kleen static struct page *dequeue_huge_page_vma(struct hstate *h,
536a5516438SAndi Kleen 				struct vm_area_struct *vma,
537af0ed73eSJoonsoo Kim 				unsigned long address, int avoid_reserve,
538af0ed73eSJoonsoo Kim 				long chg)
5391da177e4SLinus Torvalds {
540b1c12cbcSKonstantin Khlebnikov 	struct page *page = NULL;
541480eccf9SLee Schermerhorn 	struct mempolicy *mpol;
54219770b32SMel Gorman 	nodemask_t *nodemask;
543c0ff7453SMiao Xie 	struct zonelist *zonelist;
544dd1a239fSMel Gorman 	struct zone *zone;
545dd1a239fSMel Gorman 	struct zoneref *z;
546cc9a6c87SMel Gorman 	unsigned int cpuset_mems_cookie;
5471da177e4SLinus Torvalds 
548a1e78772SMel Gorman 	/*
549a1e78772SMel Gorman 	 * A child process with MAP_PRIVATE mappings created by their parent
550a1e78772SMel Gorman 	 * have no page reserves. This check ensures that reservations are
551a1e78772SMel Gorman 	 * not "stolen". The child may still get SIGKILLed
552a1e78772SMel Gorman 	 */
553af0ed73eSJoonsoo Kim 	if (!vma_has_reserves(vma, chg) &&
554a5516438SAndi Kleen 			h->free_huge_pages - h->resv_huge_pages == 0)
555c0ff7453SMiao Xie 		goto err;
556a1e78772SMel Gorman 
55704f2cbe3SMel Gorman 	/* If reserves cannot be used, ensure enough pages are in the pool */
558a5516438SAndi Kleen 	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
5596eab04a8SJustin P. Mattock 		goto err;
56004f2cbe3SMel Gorman 
5619966c4bbSJoonsoo Kim retry_cpuset:
5629966c4bbSJoonsoo Kim 	cpuset_mems_cookie = get_mems_allowed();
5639966c4bbSJoonsoo Kim 	zonelist = huge_zonelist(vma, address,
5649966c4bbSJoonsoo Kim 					htlb_alloc_mask, &mpol, &nodemask);
5659966c4bbSJoonsoo Kim 
56619770b32SMel Gorman 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
56719770b32SMel Gorman 						MAX_NR_ZONES - 1, nodemask) {
568bf50bab2SNaoya Horiguchi 		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
569bf50bab2SNaoya Horiguchi 			page = dequeue_huge_page_node(h, zone_to_nid(zone));
570bf50bab2SNaoya Horiguchi 			if (page) {
571af0ed73eSJoonsoo Kim 				if (avoid_reserve)
572af0ed73eSJoonsoo Kim 					break;
573af0ed73eSJoonsoo Kim 				if (!vma_has_reserves(vma, chg))
574af0ed73eSJoonsoo Kim 					break;
575af0ed73eSJoonsoo Kim 
57607443a85SJoonsoo Kim 				SetPagePrivate(page);
577a63884e9SJoonsoo Kim 				h->resv_huge_pages--;
5785ab3ee7bSKen Chen 				break;
5791da177e4SLinus Torvalds 			}
5803abf7afdSAndrew Morton 		}
581bf50bab2SNaoya Horiguchi 	}
582cc9a6c87SMel Gorman 
583cc9a6c87SMel Gorman 	mpol_cond_put(mpol);
584cc9a6c87SMel Gorman 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
585cc9a6c87SMel Gorman 		goto retry_cpuset;
586cc9a6c87SMel Gorman 	return page;
587cc9a6c87SMel Gorman 
588c0ff7453SMiao Xie err:
589cc9a6c87SMel Gorman 	return NULL;
5901da177e4SLinus Torvalds }
5911da177e4SLinus Torvalds 
592a5516438SAndi Kleen static void update_and_free_page(struct hstate *h, struct page *page)
5936af2acb6SAdam Litke {
5946af2acb6SAdam Litke 	int i;
595a5516438SAndi Kleen 
59618229df5SAndy Whitcroft 	VM_BUG_ON(h->order >= MAX_ORDER);
59718229df5SAndy Whitcroft 
598a5516438SAndi Kleen 	h->nr_huge_pages--;
599a5516438SAndi Kleen 	h->nr_huge_pages_node[page_to_nid(page)]--;
600a5516438SAndi Kleen 	for (i = 0; i < pages_per_huge_page(h); i++) {
60132f84528SChris Forbes 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
60232f84528SChris Forbes 				1 << PG_referenced | 1 << PG_dirty |
60332f84528SChris Forbes 				1 << PG_active | 1 << PG_reserved |
6046af2acb6SAdam Litke 				1 << PG_private | 1 << PG_writeback);
6056af2acb6SAdam Litke 	}
6069dd540e2SAneesh Kumar K.V 	VM_BUG_ON(hugetlb_cgroup_from_page(page));
6076af2acb6SAdam Litke 	set_compound_page_dtor(page, NULL);
6086af2acb6SAdam Litke 	set_page_refcounted(page);
6097f2e9525SGerald Schaefer 	arch_release_hugepage(page);
610a5516438SAndi Kleen 	__free_pages(page, huge_page_order(h));
6116af2acb6SAdam Litke }
6126af2acb6SAdam Litke 
613e5ff2159SAndi Kleen struct hstate *size_to_hstate(unsigned long size)
614e5ff2159SAndi Kleen {
615e5ff2159SAndi Kleen 	struct hstate *h;
616e5ff2159SAndi Kleen 
617e5ff2159SAndi Kleen 	for_each_hstate(h) {
618e5ff2159SAndi Kleen 		if (huge_page_size(h) == size)
619e5ff2159SAndi Kleen 			return h;
620e5ff2159SAndi Kleen 	}
621e5ff2159SAndi Kleen 	return NULL;
622e5ff2159SAndi Kleen }
623e5ff2159SAndi Kleen 
62427a85ef1SDavid Gibson static void free_huge_page(struct page *page)
62527a85ef1SDavid Gibson {
626a5516438SAndi Kleen 	/*
627a5516438SAndi Kleen 	 * Can't pass hstate in here because it is called from the
628a5516438SAndi Kleen 	 * compound page destructor.
629a5516438SAndi Kleen 	 */
630e5ff2159SAndi Kleen 	struct hstate *h = page_hstate(page);
6317893d1d5SAdam Litke 	int nid = page_to_nid(page);
63290481622SDavid Gibson 	struct hugepage_subpool *spool =
63390481622SDavid Gibson 		(struct hugepage_subpool *)page_private(page);
63407443a85SJoonsoo Kim 	bool restore_reserve;
63527a85ef1SDavid Gibson 
636e5df70abSAndy Whitcroft 	set_page_private(page, 0);
63723be7468SMel Gorman 	page->mapping = NULL;
6387893d1d5SAdam Litke 	BUG_ON(page_count(page));
6390fe6e20bSNaoya Horiguchi 	BUG_ON(page_mapcount(page));
64007443a85SJoonsoo Kim 	restore_reserve = PagePrivate(page);
64127a85ef1SDavid Gibson 
64227a85ef1SDavid Gibson 	spin_lock(&hugetlb_lock);
6436d76dcf4SAneesh Kumar K.V 	hugetlb_cgroup_uncharge_page(hstate_index(h),
6446d76dcf4SAneesh Kumar K.V 				     pages_per_huge_page(h), page);
64507443a85SJoonsoo Kim 	if (restore_reserve)
64607443a85SJoonsoo Kim 		h->resv_huge_pages++;
64707443a85SJoonsoo Kim 
648aa888a74SAndi Kleen 	if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
6490edaecfaSAneesh Kumar K.V 		/* remove the page from active list */
6500edaecfaSAneesh Kumar K.V 		list_del(&page->lru);
651a5516438SAndi Kleen 		update_and_free_page(h, page);
652a5516438SAndi Kleen 		h->surplus_huge_pages--;
653a5516438SAndi Kleen 		h->surplus_huge_pages_node[nid]--;
6547893d1d5SAdam Litke 	} else {
6555d3a551cSWill Deacon 		arch_clear_hugepage_flags(page);
656a5516438SAndi Kleen 		enqueue_huge_page(h, page);
6577893d1d5SAdam Litke 	}
65827a85ef1SDavid Gibson 	spin_unlock(&hugetlb_lock);
65990481622SDavid Gibson 	hugepage_subpool_put_pages(spool, 1);
66027a85ef1SDavid Gibson }
66127a85ef1SDavid Gibson 
662a5516438SAndi Kleen static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
663b7ba30c6SAndi Kleen {
6640edaecfaSAneesh Kumar K.V 	INIT_LIST_HEAD(&page->lru);
665b7ba30c6SAndi Kleen 	set_compound_page_dtor(page, free_huge_page);
666b7ba30c6SAndi Kleen 	spin_lock(&hugetlb_lock);
6679dd540e2SAneesh Kumar K.V 	set_hugetlb_cgroup(page, NULL);
668a5516438SAndi Kleen 	h->nr_huge_pages++;
669a5516438SAndi Kleen 	h->nr_huge_pages_node[nid]++;
670b7ba30c6SAndi Kleen 	spin_unlock(&hugetlb_lock);
671b7ba30c6SAndi Kleen 	put_page(page); /* free it into the hugepage allocator */
672b7ba30c6SAndi Kleen }
673b7ba30c6SAndi Kleen 
67420a0307cSWu Fengguang static void prep_compound_gigantic_page(struct page *page, unsigned long order)
67520a0307cSWu Fengguang {
67620a0307cSWu Fengguang 	int i;
67720a0307cSWu Fengguang 	int nr_pages = 1 << order;
67820a0307cSWu Fengguang 	struct page *p = page + 1;
67920a0307cSWu Fengguang 
68020a0307cSWu Fengguang 	/* we rely on prep_new_huge_page to set the destructor */
68120a0307cSWu Fengguang 	set_compound_order(page, order);
68220a0307cSWu Fengguang 	__SetPageHead(page);
68320a0307cSWu Fengguang 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
68420a0307cSWu Fengguang 		__SetPageTail(p);
68558a84aa9SYouquan Song 		set_page_count(p, 0);
68620a0307cSWu Fengguang 		p->first_page = page;
68720a0307cSWu Fengguang 	}
68820a0307cSWu Fengguang }
68920a0307cSWu Fengguang 
6907795912cSAndrew Morton /*
6917795912cSAndrew Morton  * PageHuge() only returns true for hugetlbfs pages, but not for normal or
6927795912cSAndrew Morton  * transparent huge pages.  See the PageTransHuge() documentation for more
6937795912cSAndrew Morton  * details.
6947795912cSAndrew Morton  */
69520a0307cSWu Fengguang int PageHuge(struct page *page)
69620a0307cSWu Fengguang {
69720a0307cSWu Fengguang 	compound_page_dtor *dtor;
69820a0307cSWu Fengguang 
69920a0307cSWu Fengguang 	if (!PageCompound(page))
70020a0307cSWu Fengguang 		return 0;
70120a0307cSWu Fengguang 
70220a0307cSWu Fengguang 	page = compound_head(page);
70320a0307cSWu Fengguang 	dtor = get_compound_page_dtor(page);
70420a0307cSWu Fengguang 
70520a0307cSWu Fengguang 	return dtor == free_huge_page;
70620a0307cSWu Fengguang }
70743131e14SNaoya Horiguchi EXPORT_SYMBOL_GPL(PageHuge);
70843131e14SNaoya Horiguchi 
70913d60f4bSZhang Yi pgoff_t __basepage_index(struct page *page)
71013d60f4bSZhang Yi {
71113d60f4bSZhang Yi 	struct page *page_head = compound_head(page);
71213d60f4bSZhang Yi 	pgoff_t index = page_index(page_head);
71313d60f4bSZhang Yi 	unsigned long compound_idx;
71413d60f4bSZhang Yi 
71513d60f4bSZhang Yi 	if (!PageHuge(page_head))
71613d60f4bSZhang Yi 		return page_index(page);
71713d60f4bSZhang Yi 
71813d60f4bSZhang Yi 	if (compound_order(page_head) >= MAX_ORDER)
71913d60f4bSZhang Yi 		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
72013d60f4bSZhang Yi 	else
72113d60f4bSZhang Yi 		compound_idx = page - page_head;
72213d60f4bSZhang Yi 
72313d60f4bSZhang Yi 	return (index << compound_order(page_head)) + compound_idx;
72413d60f4bSZhang Yi }
72513d60f4bSZhang Yi 
726a5516438SAndi Kleen static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
7271da177e4SLinus Torvalds {
7281da177e4SLinus Torvalds 	struct page *page;
729f96efd58SJoe Jin 
730aa888a74SAndi Kleen 	if (h->order >= MAX_ORDER)
731aa888a74SAndi Kleen 		return NULL;
732aa888a74SAndi Kleen 
7336484eb3eSMel Gorman 	page = alloc_pages_exact_node(nid,
734551883aeSNishanth Aravamudan 		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
735551883aeSNishanth Aravamudan 						__GFP_REPEAT|__GFP_NOWARN,
736a5516438SAndi Kleen 		huge_page_order(h));
7371da177e4SLinus Torvalds 	if (page) {
7387f2e9525SGerald Schaefer 		if (arch_prepare_hugepage(page)) {
739caff3a2cSGerald Schaefer 			__free_pages(page, huge_page_order(h));
7407b8ee84dSHarvey Harrison 			return NULL;
7417f2e9525SGerald Schaefer 		}
742a5516438SAndi Kleen 		prep_new_huge_page(h, page, nid);
7431da177e4SLinus Torvalds 	}
74463b4613cSNishanth Aravamudan 
74563b4613cSNishanth Aravamudan 	return page;
74663b4613cSNishanth Aravamudan }
74763b4613cSNishanth Aravamudan 
7485ced66c9SAndi Kleen /*
7496ae11b27SLee Schermerhorn  * common helper functions for hstate_next_node_to_{alloc|free}.
7506ae11b27SLee Schermerhorn  * We may have allocated or freed a huge page based on a different
7516ae11b27SLee Schermerhorn  * nodes_allowed previously, so h->next_node_to_{alloc|free} might
7526ae11b27SLee Schermerhorn  * be outside of *nodes_allowed.  Ensure that we use an allowed
7536ae11b27SLee Schermerhorn  * node for alloc or free.
7549a76db09SLee Schermerhorn  */
7556ae11b27SLee Schermerhorn static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
7569a76db09SLee Schermerhorn {
7576ae11b27SLee Schermerhorn 	nid = next_node(nid, *nodes_allowed);
7589a76db09SLee Schermerhorn 	if (nid == MAX_NUMNODES)
7596ae11b27SLee Schermerhorn 		nid = first_node(*nodes_allowed);
7609a76db09SLee Schermerhorn 	VM_BUG_ON(nid >= MAX_NUMNODES);
7619a76db09SLee Schermerhorn 
7629a76db09SLee Schermerhorn 	return nid;
7639a76db09SLee Schermerhorn }
7649a76db09SLee Schermerhorn 
7656ae11b27SLee Schermerhorn static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
7665ced66c9SAndi Kleen {
7676ae11b27SLee Schermerhorn 	if (!node_isset(nid, *nodes_allowed))
7686ae11b27SLee Schermerhorn 		nid = next_node_allowed(nid, nodes_allowed);
7699a76db09SLee Schermerhorn 	return nid;
7705ced66c9SAndi Kleen }
7715ced66c9SAndi Kleen 
7726ae11b27SLee Schermerhorn /*
7736ae11b27SLee Schermerhorn  * returns the previously saved node ["this node"] from which to
7746ae11b27SLee Schermerhorn  * allocate a persistent huge page for the pool and advance the
7756ae11b27SLee Schermerhorn  * next node from which to allocate, handling wrap at end of node
7766ae11b27SLee Schermerhorn  * mask.
7776ae11b27SLee Schermerhorn  */
7786ae11b27SLee Schermerhorn static int hstate_next_node_to_alloc(struct hstate *h,
7796ae11b27SLee Schermerhorn 					nodemask_t *nodes_allowed)
7806ae11b27SLee Schermerhorn {
7816ae11b27SLee Schermerhorn 	int nid;
7826ae11b27SLee Schermerhorn 
7836ae11b27SLee Schermerhorn 	VM_BUG_ON(!nodes_allowed);
7846ae11b27SLee Schermerhorn 
7856ae11b27SLee Schermerhorn 	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
7866ae11b27SLee Schermerhorn 	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
7876ae11b27SLee Schermerhorn 
7886ae11b27SLee Schermerhorn 	return nid;
7896ae11b27SLee Schermerhorn }
7906ae11b27SLee Schermerhorn 
791e8c5c824SLee Schermerhorn /*
7926ae11b27SLee Schermerhorn  * helper for free_pool_huge_page() - return the previously saved
7936ae11b27SLee Schermerhorn  * node ["this node"] from which to free a huge page.  Advance the
7946ae11b27SLee Schermerhorn  * next node id whether or not we find a free huge page to free so
7956ae11b27SLee Schermerhorn  * that the next attempt to free addresses the next node.
796e8c5c824SLee Schermerhorn  */
7976ae11b27SLee Schermerhorn static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
798e8c5c824SLee Schermerhorn {
7996ae11b27SLee Schermerhorn 	int nid;
8009a76db09SLee Schermerhorn 
8016ae11b27SLee Schermerhorn 	VM_BUG_ON(!nodes_allowed);
8026ae11b27SLee Schermerhorn 
8036ae11b27SLee Schermerhorn 	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
8046ae11b27SLee Schermerhorn 	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
8056ae11b27SLee Schermerhorn 
8069a76db09SLee Schermerhorn 	return nid;
807e8c5c824SLee Schermerhorn }
808e8c5c824SLee Schermerhorn 
809b2261026SJoonsoo Kim #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
810b2261026SJoonsoo Kim 	for (nr_nodes = nodes_weight(*mask);				\
811b2261026SJoonsoo Kim 		nr_nodes > 0 &&						\
812b2261026SJoonsoo Kim 		((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
813b2261026SJoonsoo Kim 		nr_nodes--)
814b2261026SJoonsoo Kim 
815b2261026SJoonsoo Kim #define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
816b2261026SJoonsoo Kim 	for (nr_nodes = nodes_weight(*mask);				\
817b2261026SJoonsoo Kim 		nr_nodes > 0 &&						\
818b2261026SJoonsoo Kim 		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
819b2261026SJoonsoo Kim 		nr_nodes--)
820b2261026SJoonsoo Kim 
821b2261026SJoonsoo Kim static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
822b2261026SJoonsoo Kim {
823b2261026SJoonsoo Kim 	struct page *page;
824b2261026SJoonsoo Kim 	int nr_nodes, node;
825b2261026SJoonsoo Kim 	int ret = 0;
826b2261026SJoonsoo Kim 
827b2261026SJoonsoo Kim 	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
828b2261026SJoonsoo Kim 		page = alloc_fresh_huge_page_node(h, node);
829b2261026SJoonsoo Kim 		if (page) {
830b2261026SJoonsoo Kim 			ret = 1;
831b2261026SJoonsoo Kim 			break;
832b2261026SJoonsoo Kim 		}
833b2261026SJoonsoo Kim 	}
834b2261026SJoonsoo Kim 
835b2261026SJoonsoo Kim 	if (ret)
836b2261026SJoonsoo Kim 		count_vm_event(HTLB_BUDDY_PGALLOC);
837b2261026SJoonsoo Kim 	else
838b2261026SJoonsoo Kim 		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
839b2261026SJoonsoo Kim 
840b2261026SJoonsoo Kim 	return ret;
841b2261026SJoonsoo Kim }
842b2261026SJoonsoo Kim 
843e8c5c824SLee Schermerhorn /*
844e8c5c824SLee Schermerhorn  * Free huge page from pool from next node to free.
845e8c5c824SLee Schermerhorn  * Attempt to keep persistent huge pages more or less
846e8c5c824SLee Schermerhorn  * balanced over allowed nodes.
847e8c5c824SLee Schermerhorn  * Called with hugetlb_lock locked.
848e8c5c824SLee Schermerhorn  */
8496ae11b27SLee Schermerhorn static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
8506ae11b27SLee Schermerhorn 							 bool acct_surplus)
851e8c5c824SLee Schermerhorn {
852b2261026SJoonsoo Kim 	int nr_nodes, node;
853e8c5c824SLee Schermerhorn 	int ret = 0;
854e8c5c824SLee Schermerhorn 
855b2261026SJoonsoo Kim 	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
856685f3457SLee Schermerhorn 		/*
857685f3457SLee Schermerhorn 		 * If we're returning unused surplus pages, only examine
858685f3457SLee Schermerhorn 		 * nodes with surplus pages.
859685f3457SLee Schermerhorn 		 */
860b2261026SJoonsoo Kim 		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
861b2261026SJoonsoo Kim 		    !list_empty(&h->hugepage_freelists[node])) {
862e8c5c824SLee Schermerhorn 			struct page *page =
863b2261026SJoonsoo Kim 				list_entry(h->hugepage_freelists[node].next,
864e8c5c824SLee Schermerhorn 					  struct page, lru);
865e8c5c824SLee Schermerhorn 			list_del(&page->lru);
866e8c5c824SLee Schermerhorn 			h->free_huge_pages--;
867b2261026SJoonsoo Kim 			h->free_huge_pages_node[node]--;
868685f3457SLee Schermerhorn 			if (acct_surplus) {
869685f3457SLee Schermerhorn 				h->surplus_huge_pages--;
870b2261026SJoonsoo Kim 				h->surplus_huge_pages_node[node]--;
871685f3457SLee Schermerhorn 			}
872e8c5c824SLee Schermerhorn 			update_and_free_page(h, page);
873e8c5c824SLee Schermerhorn 			ret = 1;
8749a76db09SLee Schermerhorn 			break;
875e8c5c824SLee Schermerhorn 		}
876b2261026SJoonsoo Kim 	}
877e8c5c824SLee Schermerhorn 
878e8c5c824SLee Schermerhorn 	return ret;
879e8c5c824SLee Schermerhorn }
880e8c5c824SLee Schermerhorn 
881bf50bab2SNaoya Horiguchi static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
8827893d1d5SAdam Litke {
8837893d1d5SAdam Litke 	struct page *page;
884bf50bab2SNaoya Horiguchi 	unsigned int r_nid;
8857893d1d5SAdam Litke 
886aa888a74SAndi Kleen 	if (h->order >= MAX_ORDER)
887aa888a74SAndi Kleen 		return NULL;
888aa888a74SAndi Kleen 
889d1c3fb1fSNishanth Aravamudan 	/*
890d1c3fb1fSNishanth Aravamudan 	 * Assume we will successfully allocate the surplus page to
891d1c3fb1fSNishanth Aravamudan 	 * prevent racing processes from causing the surplus to exceed
892d1c3fb1fSNishanth Aravamudan 	 * overcommit
893d1c3fb1fSNishanth Aravamudan 	 *
894d1c3fb1fSNishanth Aravamudan 	 * This however introduces a different race, where a process B
895d1c3fb1fSNishanth Aravamudan 	 * tries to grow the static hugepage pool while alloc_pages() is
896d1c3fb1fSNishanth Aravamudan 	 * called by process A. B will only examine the per-node
897d1c3fb1fSNishanth Aravamudan 	 * counters in determining if surplus huge pages can be
898d1c3fb1fSNishanth Aravamudan 	 * converted to normal huge pages in adjust_pool_surplus(). A
899d1c3fb1fSNishanth Aravamudan 	 * won't be able to increment the per-node counter, until the
900d1c3fb1fSNishanth Aravamudan 	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
901d1c3fb1fSNishanth Aravamudan 	 * no more huge pages can be converted from surplus to normal
902d1c3fb1fSNishanth Aravamudan 	 * state (and doesn't try to convert again). Thus, we have a
903d1c3fb1fSNishanth Aravamudan 	 * case where a surplus huge page exists, the pool is grown, and
904d1c3fb1fSNishanth Aravamudan 	 * the surplus huge page still exists after, even though it
905d1c3fb1fSNishanth Aravamudan 	 * should just have been converted to a normal huge page. This
906d1c3fb1fSNishanth Aravamudan 	 * does not leak memory, though, as the hugepage will be freed
907d1c3fb1fSNishanth Aravamudan 	 * once it is out of use. It also does not allow the counters to
908d1c3fb1fSNishanth Aravamudan 	 * go out of whack in adjust_pool_surplus() as we don't modify
909d1c3fb1fSNishanth Aravamudan 	 * the node values until we've gotten the hugepage and only the
910d1c3fb1fSNishanth Aravamudan 	 * per-node value is checked there.
911d1c3fb1fSNishanth Aravamudan 	 */
912d1c3fb1fSNishanth Aravamudan 	spin_lock(&hugetlb_lock);
913a5516438SAndi Kleen 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
914d1c3fb1fSNishanth Aravamudan 		spin_unlock(&hugetlb_lock);
915d1c3fb1fSNishanth Aravamudan 		return NULL;
916d1c3fb1fSNishanth Aravamudan 	} else {
917a5516438SAndi Kleen 		h->nr_huge_pages++;
918a5516438SAndi Kleen 		h->surplus_huge_pages++;
919d1c3fb1fSNishanth Aravamudan 	}
920d1c3fb1fSNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
921d1c3fb1fSNishanth Aravamudan 
922bf50bab2SNaoya Horiguchi 	if (nid == NUMA_NO_NODE)
923551883aeSNishanth Aravamudan 		page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
924551883aeSNishanth Aravamudan 				   __GFP_REPEAT|__GFP_NOWARN,
925a5516438SAndi Kleen 				   huge_page_order(h));
926bf50bab2SNaoya Horiguchi 	else
927bf50bab2SNaoya Horiguchi 		page = alloc_pages_exact_node(nid,
928bf50bab2SNaoya Horiguchi 			htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
929bf50bab2SNaoya Horiguchi 			__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
930d1c3fb1fSNishanth Aravamudan 
931caff3a2cSGerald Schaefer 	if (page && arch_prepare_hugepage(page)) {
932caff3a2cSGerald Schaefer 		__free_pages(page, huge_page_order(h));
933ea5768c7SHillf Danton 		page = NULL;
934caff3a2cSGerald Schaefer 	}
935caff3a2cSGerald Schaefer 
9367893d1d5SAdam Litke 	spin_lock(&hugetlb_lock);
937d1c3fb1fSNishanth Aravamudan 	if (page) {
9380edaecfaSAneesh Kumar K.V 		INIT_LIST_HEAD(&page->lru);
939bf50bab2SNaoya Horiguchi 		r_nid = page_to_nid(page);
940d1c3fb1fSNishanth Aravamudan 		set_compound_page_dtor(page, free_huge_page);
9419dd540e2SAneesh Kumar K.V 		set_hugetlb_cgroup(page, NULL);
942d1c3fb1fSNishanth Aravamudan 		/*
943d1c3fb1fSNishanth Aravamudan 		 * We incremented the global counters already
944d1c3fb1fSNishanth Aravamudan 		 */
945bf50bab2SNaoya Horiguchi 		h->nr_huge_pages_node[r_nid]++;
946bf50bab2SNaoya Horiguchi 		h->surplus_huge_pages_node[r_nid]++;
9473b116300SAdam Litke 		__count_vm_event(HTLB_BUDDY_PGALLOC);
948d1c3fb1fSNishanth Aravamudan 	} else {
949a5516438SAndi Kleen 		h->nr_huge_pages--;
950a5516438SAndi Kleen 		h->surplus_huge_pages--;
9513b116300SAdam Litke 		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
9527893d1d5SAdam Litke 	}
953d1c3fb1fSNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
9547893d1d5SAdam Litke 
9557893d1d5SAdam Litke 	return page;
9567893d1d5SAdam Litke }
9577893d1d5SAdam Litke 
958e4e574b7SAdam Litke /*
959bf50bab2SNaoya Horiguchi  * This allocation function is useful in the context where vma is irrelevant.
960bf50bab2SNaoya Horiguchi  * E.g. soft-offlining uses this function because it only cares physical
961bf50bab2SNaoya Horiguchi  * address of error page.
962bf50bab2SNaoya Horiguchi  */
963bf50bab2SNaoya Horiguchi struct page *alloc_huge_page_node(struct hstate *h, int nid)
964bf50bab2SNaoya Horiguchi {
9654ef91848SJoonsoo Kim 	struct page *page = NULL;
966bf50bab2SNaoya Horiguchi 
967bf50bab2SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
9684ef91848SJoonsoo Kim 	if (h->free_huge_pages - h->resv_huge_pages > 0)
969bf50bab2SNaoya Horiguchi 		page = dequeue_huge_page_node(h, nid);
970bf50bab2SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
971bf50bab2SNaoya Horiguchi 
97294ae8ba7SAneesh Kumar K.V 	if (!page)
973bf50bab2SNaoya Horiguchi 		page = alloc_buddy_huge_page(h, nid);
974bf50bab2SNaoya Horiguchi 
975bf50bab2SNaoya Horiguchi 	return page;
976bf50bab2SNaoya Horiguchi }
977bf50bab2SNaoya Horiguchi 
978bf50bab2SNaoya Horiguchi /*
97925985edcSLucas De Marchi  * Increase the hugetlb pool such that it can accommodate a reservation
980e4e574b7SAdam Litke  * of size 'delta'.
981e4e574b7SAdam Litke  */
982a5516438SAndi Kleen static int gather_surplus_pages(struct hstate *h, int delta)
983e4e574b7SAdam Litke {
984e4e574b7SAdam Litke 	struct list_head surplus_list;
985e4e574b7SAdam Litke 	struct page *page, *tmp;
986e4e574b7SAdam Litke 	int ret, i;
987e4e574b7SAdam Litke 	int needed, allocated;
98828073b02SHillf Danton 	bool alloc_ok = true;
989e4e574b7SAdam Litke 
990a5516438SAndi Kleen 	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
991ac09b3a1SAdam Litke 	if (needed <= 0) {
992a5516438SAndi Kleen 		h->resv_huge_pages += delta;
993e4e574b7SAdam Litke 		return 0;
994ac09b3a1SAdam Litke 	}
995e4e574b7SAdam Litke 
996e4e574b7SAdam Litke 	allocated = 0;
997e4e574b7SAdam Litke 	INIT_LIST_HEAD(&surplus_list);
998e4e574b7SAdam Litke 
999e4e574b7SAdam Litke 	ret = -ENOMEM;
1000e4e574b7SAdam Litke retry:
1001e4e574b7SAdam Litke 	spin_unlock(&hugetlb_lock);
1002e4e574b7SAdam Litke 	for (i = 0; i < needed; i++) {
1003bf50bab2SNaoya Horiguchi 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
100428073b02SHillf Danton 		if (!page) {
100528073b02SHillf Danton 			alloc_ok = false;
100628073b02SHillf Danton 			break;
100728073b02SHillf Danton 		}
1008e4e574b7SAdam Litke 		list_add(&page->lru, &surplus_list);
1009e4e574b7SAdam Litke 	}
101028073b02SHillf Danton 	allocated += i;
1011e4e574b7SAdam Litke 
1012e4e574b7SAdam Litke 	/*
1013e4e574b7SAdam Litke 	 * After retaking hugetlb_lock, we need to recalculate 'needed'
1014e4e574b7SAdam Litke 	 * because either resv_huge_pages or free_huge_pages may have changed.
1015e4e574b7SAdam Litke 	 */
1016e4e574b7SAdam Litke 	spin_lock(&hugetlb_lock);
1017a5516438SAndi Kleen 	needed = (h->resv_huge_pages + delta) -
1018a5516438SAndi Kleen 			(h->free_huge_pages + allocated);
101928073b02SHillf Danton 	if (needed > 0) {
102028073b02SHillf Danton 		if (alloc_ok)
1021e4e574b7SAdam Litke 			goto retry;
102228073b02SHillf Danton 		/*
102328073b02SHillf Danton 		 * We were not able to allocate enough pages to
102428073b02SHillf Danton 		 * satisfy the entire reservation so we free what
102528073b02SHillf Danton 		 * we've allocated so far.
102628073b02SHillf Danton 		 */
102728073b02SHillf Danton 		goto free;
102828073b02SHillf Danton 	}
1029e4e574b7SAdam Litke 	/*
1030e4e574b7SAdam Litke 	 * The surplus_list now contains _at_least_ the number of extra pages
103125985edcSLucas De Marchi 	 * needed to accommodate the reservation.  Add the appropriate number
1032e4e574b7SAdam Litke 	 * of pages to the hugetlb pool and free the extras back to the buddy
1033ac09b3a1SAdam Litke 	 * allocator.  Commit the entire reservation here to prevent another
1034ac09b3a1SAdam Litke 	 * process from stealing the pages as they are added to the pool but
1035ac09b3a1SAdam Litke 	 * before they are reserved.
1036e4e574b7SAdam Litke 	 */
1037e4e574b7SAdam Litke 	needed += allocated;
1038a5516438SAndi Kleen 	h->resv_huge_pages += delta;
1039e4e574b7SAdam Litke 	ret = 0;
1040a9869b83SNaoya Horiguchi 
104119fc3f0aSAdam Litke 	/* Free the needed pages to the hugetlb pool */
104219fc3f0aSAdam Litke 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
104319fc3f0aSAdam Litke 		if ((--needed) < 0)
104419fc3f0aSAdam Litke 			break;
1045a9869b83SNaoya Horiguchi 		/*
1046a9869b83SNaoya Horiguchi 		 * This page is now managed by the hugetlb allocator and has
1047a9869b83SNaoya Horiguchi 		 * no users -- drop the buddy allocator's reference.
1048a9869b83SNaoya Horiguchi 		 */
1049a9869b83SNaoya Horiguchi 		put_page_testzero(page);
1050a9869b83SNaoya Horiguchi 		VM_BUG_ON(page_count(page));
1051a5516438SAndi Kleen 		enqueue_huge_page(h, page);
105219fc3f0aSAdam Litke 	}
105328073b02SHillf Danton free:
1054b0365c8dSHillf Danton 	spin_unlock(&hugetlb_lock);
105519fc3f0aSAdam Litke 
105619fc3f0aSAdam Litke 	/* Free unnecessary surplus pages to the buddy allocator */
1057c0d934baSJoonsoo Kim 	list_for_each_entry_safe(page, tmp, &surplus_list, lru)
1058a9869b83SNaoya Horiguchi 		put_page(page);
105919fc3f0aSAdam Litke 	spin_lock(&hugetlb_lock);
1060e4e574b7SAdam Litke 
1061e4e574b7SAdam Litke 	return ret;
1062e4e574b7SAdam Litke }
1063e4e574b7SAdam Litke 
1064e4e574b7SAdam Litke /*
1065e4e574b7SAdam Litke  * When releasing a hugetlb pool reservation, any surplus pages that were
1066e4e574b7SAdam Litke  * allocated to satisfy the reservation must be explicitly freed if they were
1067e4e574b7SAdam Litke  * never used.
1068685f3457SLee Schermerhorn  * Called with hugetlb_lock held.
1069e4e574b7SAdam Litke  */
1070a5516438SAndi Kleen static void return_unused_surplus_pages(struct hstate *h,
1071a5516438SAndi Kleen 					unsigned long unused_resv_pages)
1072e4e574b7SAdam Litke {
1073e4e574b7SAdam Litke 	unsigned long nr_pages;
1074e4e574b7SAdam Litke 
1075ac09b3a1SAdam Litke 	/* Uncommit the reservation */
1076a5516438SAndi Kleen 	h->resv_huge_pages -= unused_resv_pages;
1077ac09b3a1SAdam Litke 
1078aa888a74SAndi Kleen 	/* Cannot return gigantic pages currently */
1079aa888a74SAndi Kleen 	if (h->order >= MAX_ORDER)
1080aa888a74SAndi Kleen 		return;
1081aa888a74SAndi Kleen 
1082a5516438SAndi Kleen 	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
1083e4e574b7SAdam Litke 
1084685f3457SLee Schermerhorn 	/*
1085685f3457SLee Schermerhorn 	 * We want to release as many surplus pages as possible, spread
10869b5e5d0fSLee Schermerhorn 	 * evenly across all nodes with memory. Iterate across these nodes
10879b5e5d0fSLee Schermerhorn 	 * until we can no longer free unreserved surplus pages. This occurs
10889b5e5d0fSLee Schermerhorn 	 * when the nodes with surplus pages have no free pages.
10899b5e5d0fSLee Schermerhorn 	 * free_pool_huge_page() will balance the the freed pages across the
10909b5e5d0fSLee Schermerhorn 	 * on-line nodes with memory and will handle the hstate accounting.
1091685f3457SLee Schermerhorn 	 */
1092685f3457SLee Schermerhorn 	while (nr_pages--) {
10938cebfcd0SLai Jiangshan 		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
1094685f3457SLee Schermerhorn 			break;
1095e4e574b7SAdam Litke 	}
1096e4e574b7SAdam Litke }
1097e4e574b7SAdam Litke 
1098c37f9fb1SAndy Whitcroft /*
1099c37f9fb1SAndy Whitcroft  * Determine if the huge page at addr within the vma has an associated
1100c37f9fb1SAndy Whitcroft  * reservation.  Where it does not we will need to logically increase
110190481622SDavid Gibson  * reservation and actually increase subpool usage before an allocation
110290481622SDavid Gibson  * can occur.  Where any new reservation would be required the
110390481622SDavid Gibson  * reservation change is prepared, but not committed.  Once the page
110490481622SDavid Gibson  * has been allocated from the subpool and instantiated the change should
110590481622SDavid Gibson  * be committed via vma_commit_reservation.  No action is required on
110690481622SDavid Gibson  * failure.
1107c37f9fb1SAndy Whitcroft  */
1108e2f17d94SRoel Kluin static long vma_needs_reservation(struct hstate *h,
1109a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long addr)
1110c37f9fb1SAndy Whitcroft {
1111c37f9fb1SAndy Whitcroft 	struct address_space *mapping = vma->vm_file->f_mapping;
1112c37f9fb1SAndy Whitcroft 	struct inode *inode = mapping->host;
1113c37f9fb1SAndy Whitcroft 
1114f83a275dSMel Gorman 	if (vma->vm_flags & VM_MAYSHARE) {
1115a5516438SAndi Kleen 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1116c37f9fb1SAndy Whitcroft 		return region_chg(&inode->i_mapping->private_list,
1117c37f9fb1SAndy Whitcroft 							idx, idx + 1);
1118c37f9fb1SAndy Whitcroft 
111984afd99bSAndy Whitcroft 	} else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1120c37f9fb1SAndy Whitcroft 		return 1;
1121c37f9fb1SAndy Whitcroft 
112284afd99bSAndy Whitcroft 	} else  {
1123e2f17d94SRoel Kluin 		long err;
1124a5516438SAndi Kleen 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1125f522c3acSJoonsoo Kim 		struct resv_map *resv = vma_resv_map(vma);
112684afd99bSAndy Whitcroft 
1127f522c3acSJoonsoo Kim 		err = region_chg(&resv->regions, idx, idx + 1);
112884afd99bSAndy Whitcroft 		if (err < 0)
112984afd99bSAndy Whitcroft 			return err;
1130c37f9fb1SAndy Whitcroft 		return 0;
1131c37f9fb1SAndy Whitcroft 	}
113284afd99bSAndy Whitcroft }
1133a5516438SAndi Kleen static void vma_commit_reservation(struct hstate *h,
1134a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long addr)
1135c37f9fb1SAndy Whitcroft {
1136c37f9fb1SAndy Whitcroft 	struct address_space *mapping = vma->vm_file->f_mapping;
1137c37f9fb1SAndy Whitcroft 	struct inode *inode = mapping->host;
1138c37f9fb1SAndy Whitcroft 
1139f83a275dSMel Gorman 	if (vma->vm_flags & VM_MAYSHARE) {
1140a5516438SAndi Kleen 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1141c37f9fb1SAndy Whitcroft 		region_add(&inode->i_mapping->private_list, idx, idx + 1);
114284afd99bSAndy Whitcroft 
114384afd99bSAndy Whitcroft 	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1144a5516438SAndi Kleen 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1145f522c3acSJoonsoo Kim 		struct resv_map *resv = vma_resv_map(vma);
114684afd99bSAndy Whitcroft 
114784afd99bSAndy Whitcroft 		/* Mark this page used in the map. */
1148f522c3acSJoonsoo Kim 		region_add(&resv->regions, idx, idx + 1);
1149c37f9fb1SAndy Whitcroft 	}
1150c37f9fb1SAndy Whitcroft }
1151c37f9fb1SAndy Whitcroft 
1152348ea204SAdam Litke static struct page *alloc_huge_page(struct vm_area_struct *vma,
115304f2cbe3SMel Gorman 				    unsigned long addr, int avoid_reserve)
1154348ea204SAdam Litke {
115590481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_vma(vma);
1156a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
1157348ea204SAdam Litke 	struct page *page;
1158e2f17d94SRoel Kluin 	long chg;
11596d76dcf4SAneesh Kumar K.V 	int ret, idx;
11606d76dcf4SAneesh Kumar K.V 	struct hugetlb_cgroup *h_cg;
11612fc39cecSAdam Litke 
11626d76dcf4SAneesh Kumar K.V 	idx = hstate_index(h);
1163a1e78772SMel Gorman 	/*
116490481622SDavid Gibson 	 * Processes that did not create the mapping will have no
116590481622SDavid Gibson 	 * reserves and will not have accounted against subpool
116690481622SDavid Gibson 	 * limit. Check that the subpool limit can be made before
116790481622SDavid Gibson 	 * satisfying the allocation MAP_NORESERVE mappings may also
116890481622SDavid Gibson 	 * need pages and subpool limit allocated allocated if no reserve
116990481622SDavid Gibson 	 * mapping overlaps.
1170a1e78772SMel Gorman 	 */
1171a5516438SAndi Kleen 	chg = vma_needs_reservation(h, vma, addr);
1172c37f9fb1SAndy Whitcroft 	if (chg < 0)
117376dcee75SAneesh Kumar K.V 		return ERR_PTR(-ENOMEM);
11748bb3f12eSJoonsoo Kim 	if (chg || avoid_reserve)
11758bb3f12eSJoonsoo Kim 		if (hugepage_subpool_get_pages(spool, 1))
117676dcee75SAneesh Kumar K.V 			return ERR_PTR(-ENOSPC);
117790d8b7e6SAdam Litke 
11786d76dcf4SAneesh Kumar K.V 	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
11796d76dcf4SAneesh Kumar K.V 	if (ret) {
11808bb3f12eSJoonsoo Kim 		if (chg || avoid_reserve)
11818bb3f12eSJoonsoo Kim 			hugepage_subpool_put_pages(spool, 1);
11826d76dcf4SAneesh Kumar K.V 		return ERR_PTR(-ENOSPC);
11836d76dcf4SAneesh Kumar K.V 	}
1184a1e78772SMel Gorman 	spin_lock(&hugetlb_lock);
1185af0ed73eSJoonsoo Kim 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
118681a6fcaeSJoonsoo Kim 	if (!page) {
118794ae8ba7SAneesh Kumar K.V 		spin_unlock(&hugetlb_lock);
1188bf50bab2SNaoya Horiguchi 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1189a1e78772SMel Gorman 		if (!page) {
11906d76dcf4SAneesh Kumar K.V 			hugetlb_cgroup_uncharge_cgroup(idx,
11916d76dcf4SAneesh Kumar K.V 						       pages_per_huge_page(h),
11926d76dcf4SAneesh Kumar K.V 						       h_cg);
11938bb3f12eSJoonsoo Kim 			if (chg || avoid_reserve)
11948bb3f12eSJoonsoo Kim 				hugepage_subpool_put_pages(spool, 1);
119576dcee75SAneesh Kumar K.V 			return ERR_PTR(-ENOSPC);
1196a1e78772SMel Gorman 		}
119779dbb236SAneesh Kumar K.V 		spin_lock(&hugetlb_lock);
119879dbb236SAneesh Kumar K.V 		list_move(&page->lru, &h->hugepage_activelist);
119981a6fcaeSJoonsoo Kim 		/* Fall through */
1200a1e78772SMel Gorman 	}
120181a6fcaeSJoonsoo Kim 	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
120281a6fcaeSJoonsoo Kim 	spin_unlock(&hugetlb_lock);
1203a1e78772SMel Gorman 
120490481622SDavid Gibson 	set_page_private(page, (unsigned long)spool);
1205a1e78772SMel Gorman 
1206a5516438SAndi Kleen 	vma_commit_reservation(h, vma, addr);
12077893d1d5SAdam Litke 	return page;
1208b45b5bd6SDavid Gibson }
1209b45b5bd6SDavid Gibson 
121091f47662SCyrill Gorcunov int __weak alloc_bootmem_huge_page(struct hstate *h)
1211aa888a74SAndi Kleen {
1212aa888a74SAndi Kleen 	struct huge_bootmem_page *m;
1213b2261026SJoonsoo Kim 	int nr_nodes, node;
1214aa888a74SAndi Kleen 
1215b2261026SJoonsoo Kim 	for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
1216aa888a74SAndi Kleen 		void *addr;
1217aa888a74SAndi Kleen 
1218b2261026SJoonsoo Kim 		addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
1219aa888a74SAndi Kleen 				huge_page_size(h), huge_page_size(h), 0);
1220aa888a74SAndi Kleen 
1221aa888a74SAndi Kleen 		if (addr) {
1222aa888a74SAndi Kleen 			/*
1223aa888a74SAndi Kleen 			 * Use the beginning of the huge page to store the
1224aa888a74SAndi Kleen 			 * huge_bootmem_page struct (until gather_bootmem
1225aa888a74SAndi Kleen 			 * puts them into the mem_map).
1226aa888a74SAndi Kleen 			 */
1227aa888a74SAndi Kleen 			m = addr;
1228aa888a74SAndi Kleen 			goto found;
1229aa888a74SAndi Kleen 		}
1230aa888a74SAndi Kleen 	}
1231aa888a74SAndi Kleen 	return 0;
1232aa888a74SAndi Kleen 
1233aa888a74SAndi Kleen found:
1234aa888a74SAndi Kleen 	BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
1235aa888a74SAndi Kleen 	/* Put them into a private list first because mem_map is not up yet */
1236aa888a74SAndi Kleen 	list_add(&m->list, &huge_boot_pages);
1237aa888a74SAndi Kleen 	m->hstate = h;
1238aa888a74SAndi Kleen 	return 1;
1239aa888a74SAndi Kleen }
1240aa888a74SAndi Kleen 
124118229df5SAndy Whitcroft static void prep_compound_huge_page(struct page *page, int order)
124218229df5SAndy Whitcroft {
124318229df5SAndy Whitcroft 	if (unlikely(order > (MAX_ORDER - 1)))
124418229df5SAndy Whitcroft 		prep_compound_gigantic_page(page, order);
124518229df5SAndy Whitcroft 	else
124618229df5SAndy Whitcroft 		prep_compound_page(page, order);
124718229df5SAndy Whitcroft }
124818229df5SAndy Whitcroft 
1249aa888a74SAndi Kleen /* Put bootmem huge pages into the standard lists after mem_map is up */
1250aa888a74SAndi Kleen static void __init gather_bootmem_prealloc(void)
1251aa888a74SAndi Kleen {
1252aa888a74SAndi Kleen 	struct huge_bootmem_page *m;
1253aa888a74SAndi Kleen 
1254aa888a74SAndi Kleen 	list_for_each_entry(m, &huge_boot_pages, list) {
1255aa888a74SAndi Kleen 		struct hstate *h = m->hstate;
1256ee8f248dSBecky Bruce 		struct page *page;
1257ee8f248dSBecky Bruce 
1258ee8f248dSBecky Bruce #ifdef CONFIG_HIGHMEM
1259ee8f248dSBecky Bruce 		page = pfn_to_page(m->phys >> PAGE_SHIFT);
1260ee8f248dSBecky Bruce 		free_bootmem_late((unsigned long)m,
1261ee8f248dSBecky Bruce 				  sizeof(struct huge_bootmem_page));
1262ee8f248dSBecky Bruce #else
1263ee8f248dSBecky Bruce 		page = virt_to_page(m);
1264ee8f248dSBecky Bruce #endif
1265aa888a74SAndi Kleen 		__ClearPageReserved(page);
1266aa888a74SAndi Kleen 		WARN_ON(page_count(page) != 1);
126718229df5SAndy Whitcroft 		prep_compound_huge_page(page, h->order);
1268aa888a74SAndi Kleen 		prep_new_huge_page(h, page, page_to_nid(page));
1269b0320c7bSRafael Aquini 		/*
1270b0320c7bSRafael Aquini 		 * If we had gigantic hugepages allocated at boot time, we need
1271b0320c7bSRafael Aquini 		 * to restore the 'stolen' pages to totalram_pages in order to
1272b0320c7bSRafael Aquini 		 * fix confusing memory reports from free(1) and another
1273b0320c7bSRafael Aquini 		 * side-effects, like CommitLimit going negative.
1274b0320c7bSRafael Aquini 		 */
1275b0320c7bSRafael Aquini 		if (h->order > (MAX_ORDER - 1))
12763dcc0571SJiang Liu 			adjust_managed_page_count(page, 1 << h->order);
1277aa888a74SAndi Kleen 	}
1278aa888a74SAndi Kleen }
1279aa888a74SAndi Kleen 
12808faa8b07SAndi Kleen static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
12811da177e4SLinus Torvalds {
12821da177e4SLinus Torvalds 	unsigned long i;
12831da177e4SLinus Torvalds 
1284e5ff2159SAndi Kleen 	for (i = 0; i < h->max_huge_pages; ++i) {
1285aa888a74SAndi Kleen 		if (h->order >= MAX_ORDER) {
1286aa888a74SAndi Kleen 			if (!alloc_bootmem_huge_page(h))
1287aa888a74SAndi Kleen 				break;
12889b5e5d0fSLee Schermerhorn 		} else if (!alloc_fresh_huge_page(h,
12898cebfcd0SLai Jiangshan 					 &node_states[N_MEMORY]))
12901da177e4SLinus Torvalds 			break;
12911da177e4SLinus Torvalds 	}
12928faa8b07SAndi Kleen 	h->max_huge_pages = i;
1293e5ff2159SAndi Kleen }
1294e5ff2159SAndi Kleen 
1295e5ff2159SAndi Kleen static void __init hugetlb_init_hstates(void)
1296e5ff2159SAndi Kleen {
1297e5ff2159SAndi Kleen 	struct hstate *h;
1298e5ff2159SAndi Kleen 
1299e5ff2159SAndi Kleen 	for_each_hstate(h) {
13008faa8b07SAndi Kleen 		/* oversize hugepages were init'ed in early boot */
13018faa8b07SAndi Kleen 		if (h->order < MAX_ORDER)
13028faa8b07SAndi Kleen 			hugetlb_hstate_alloc_pages(h);
1303e5ff2159SAndi Kleen 	}
1304e5ff2159SAndi Kleen }
1305e5ff2159SAndi Kleen 
13064abd32dbSAndi Kleen static char * __init memfmt(char *buf, unsigned long n)
13074abd32dbSAndi Kleen {
13084abd32dbSAndi Kleen 	if (n >= (1UL << 30))
13094abd32dbSAndi Kleen 		sprintf(buf, "%lu GB", n >> 30);
13104abd32dbSAndi Kleen 	else if (n >= (1UL << 20))
13114abd32dbSAndi Kleen 		sprintf(buf, "%lu MB", n >> 20);
13124abd32dbSAndi Kleen 	else
13134abd32dbSAndi Kleen 		sprintf(buf, "%lu KB", n >> 10);
13144abd32dbSAndi Kleen 	return buf;
13154abd32dbSAndi Kleen }
13164abd32dbSAndi Kleen 
1317e5ff2159SAndi Kleen static void __init report_hugepages(void)
1318e5ff2159SAndi Kleen {
1319e5ff2159SAndi Kleen 	struct hstate *h;
1320e5ff2159SAndi Kleen 
1321e5ff2159SAndi Kleen 	for_each_hstate(h) {
13224abd32dbSAndi Kleen 		char buf[32];
1323ffb22af5SAndrew Morton 		pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
13244abd32dbSAndi Kleen 			memfmt(buf, huge_page_size(h)),
13254abd32dbSAndi Kleen 			h->free_huge_pages);
1326e5ff2159SAndi Kleen 	}
1327e5ff2159SAndi Kleen }
1328e5ff2159SAndi Kleen 
13291da177e4SLinus Torvalds #ifdef CONFIG_HIGHMEM
13306ae11b27SLee Schermerhorn static void try_to_free_low(struct hstate *h, unsigned long count,
13316ae11b27SLee Schermerhorn 						nodemask_t *nodes_allowed)
13321da177e4SLinus Torvalds {
13334415cc8dSChristoph Lameter 	int i;
13344415cc8dSChristoph Lameter 
1335aa888a74SAndi Kleen 	if (h->order >= MAX_ORDER)
1336aa888a74SAndi Kleen 		return;
1337aa888a74SAndi Kleen 
13386ae11b27SLee Schermerhorn 	for_each_node_mask(i, *nodes_allowed) {
13391da177e4SLinus Torvalds 		struct page *page, *next;
1340a5516438SAndi Kleen 		struct list_head *freel = &h->hugepage_freelists[i];
1341a5516438SAndi Kleen 		list_for_each_entry_safe(page, next, freel, lru) {
1342a5516438SAndi Kleen 			if (count >= h->nr_huge_pages)
13436b0c880dSAdam Litke 				return;
13441da177e4SLinus Torvalds 			if (PageHighMem(page))
13451da177e4SLinus Torvalds 				continue;
13461da177e4SLinus Torvalds 			list_del(&page->lru);
1347e5ff2159SAndi Kleen 			update_and_free_page(h, page);
1348a5516438SAndi Kleen 			h->free_huge_pages--;
1349a5516438SAndi Kleen 			h->free_huge_pages_node[page_to_nid(page)]--;
13501da177e4SLinus Torvalds 		}
13511da177e4SLinus Torvalds 	}
13521da177e4SLinus Torvalds }
13531da177e4SLinus Torvalds #else
13546ae11b27SLee Schermerhorn static inline void try_to_free_low(struct hstate *h, unsigned long count,
13556ae11b27SLee Schermerhorn 						nodemask_t *nodes_allowed)
13561da177e4SLinus Torvalds {
13571da177e4SLinus Torvalds }
13581da177e4SLinus Torvalds #endif
13591da177e4SLinus Torvalds 
136020a0307cSWu Fengguang /*
136120a0307cSWu Fengguang  * Increment or decrement surplus_huge_pages.  Keep node-specific counters
136220a0307cSWu Fengguang  * balanced by operating on them in a round-robin fashion.
136320a0307cSWu Fengguang  * Returns 1 if an adjustment was made.
136420a0307cSWu Fengguang  */
13656ae11b27SLee Schermerhorn static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
13666ae11b27SLee Schermerhorn 				int delta)
136720a0307cSWu Fengguang {
1368b2261026SJoonsoo Kim 	int nr_nodes, node;
136920a0307cSWu Fengguang 
137020a0307cSWu Fengguang 	VM_BUG_ON(delta != -1 && delta != 1);
137120a0307cSWu Fengguang 
1372e8c5c824SLee Schermerhorn 	if (delta < 0) {
1373b2261026SJoonsoo Kim 		for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1374b2261026SJoonsoo Kim 			if (h->surplus_huge_pages_node[node])
1375b2261026SJoonsoo Kim 				goto found;
1376b2261026SJoonsoo Kim 		}
1377b2261026SJoonsoo Kim 	} else {
1378b2261026SJoonsoo Kim 		for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1379b2261026SJoonsoo Kim 			if (h->surplus_huge_pages_node[node] <
1380b2261026SJoonsoo Kim 					h->nr_huge_pages_node[node])
1381b2261026SJoonsoo Kim 				goto found;
1382e8c5c824SLee Schermerhorn 		}
13839a76db09SLee Schermerhorn 	}
1384b2261026SJoonsoo Kim 	return 0;
138520a0307cSWu Fengguang 
1386b2261026SJoonsoo Kim found:
138720a0307cSWu Fengguang 	h->surplus_huge_pages += delta;
1388b2261026SJoonsoo Kim 	h->surplus_huge_pages_node[node] += delta;
1389b2261026SJoonsoo Kim 	return 1;
139020a0307cSWu Fengguang }
139120a0307cSWu Fengguang 
1392a5516438SAndi Kleen #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
13936ae11b27SLee Schermerhorn static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
13946ae11b27SLee Schermerhorn 						nodemask_t *nodes_allowed)
13951da177e4SLinus Torvalds {
13967893d1d5SAdam Litke 	unsigned long min_count, ret;
13971da177e4SLinus Torvalds 
1398aa888a74SAndi Kleen 	if (h->order >= MAX_ORDER)
1399aa888a74SAndi Kleen 		return h->max_huge_pages;
1400aa888a74SAndi Kleen 
14017893d1d5SAdam Litke 	/*
14027893d1d5SAdam Litke 	 * Increase the pool size
14037893d1d5SAdam Litke 	 * First take pages out of surplus state.  Then make up the
14047893d1d5SAdam Litke 	 * remaining difference by allocating fresh huge pages.
1405d1c3fb1fSNishanth Aravamudan 	 *
1406d1c3fb1fSNishanth Aravamudan 	 * We might race with alloc_buddy_huge_page() here and be unable
1407d1c3fb1fSNishanth Aravamudan 	 * to convert a surplus huge page to a normal huge page. That is
1408d1c3fb1fSNishanth Aravamudan 	 * not critical, though, it just means the overall size of the
1409d1c3fb1fSNishanth Aravamudan 	 * pool might be one hugepage larger than it needs to be, but
1410d1c3fb1fSNishanth Aravamudan 	 * within all the constraints specified by the sysctls.
14117893d1d5SAdam Litke 	 */
14121da177e4SLinus Torvalds 	spin_lock(&hugetlb_lock);
1413a5516438SAndi Kleen 	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
14146ae11b27SLee Schermerhorn 		if (!adjust_pool_surplus(h, nodes_allowed, -1))
14157893d1d5SAdam Litke 			break;
14167893d1d5SAdam Litke 	}
14177893d1d5SAdam Litke 
1418a5516438SAndi Kleen 	while (count > persistent_huge_pages(h)) {
14197893d1d5SAdam Litke 		/*
14207893d1d5SAdam Litke 		 * If this allocation races such that we no longer need the
14217893d1d5SAdam Litke 		 * page, free_huge_page will handle it by freeing the page
14227893d1d5SAdam Litke 		 * and reducing the surplus.
14237893d1d5SAdam Litke 		 */
14247893d1d5SAdam Litke 		spin_unlock(&hugetlb_lock);
14256ae11b27SLee Schermerhorn 		ret = alloc_fresh_huge_page(h, nodes_allowed);
14267893d1d5SAdam Litke 		spin_lock(&hugetlb_lock);
14277893d1d5SAdam Litke 		if (!ret)
14287893d1d5SAdam Litke 			goto out;
14297893d1d5SAdam Litke 
1430536240f2SMel Gorman 		/* Bail for signals. Probably ctrl-c from user */
1431536240f2SMel Gorman 		if (signal_pending(current))
1432536240f2SMel Gorman 			goto out;
14337893d1d5SAdam Litke 	}
14347893d1d5SAdam Litke 
14357893d1d5SAdam Litke 	/*
14367893d1d5SAdam Litke 	 * Decrease the pool size
14377893d1d5SAdam Litke 	 * First return free pages to the buddy allocator (being careful
14387893d1d5SAdam Litke 	 * to keep enough around to satisfy reservations).  Then place
14397893d1d5SAdam Litke 	 * pages into surplus state as needed so the pool will shrink
14407893d1d5SAdam Litke 	 * to the desired size as pages become free.
1441d1c3fb1fSNishanth Aravamudan 	 *
1442d1c3fb1fSNishanth Aravamudan 	 * By placing pages into the surplus state independent of the
1443d1c3fb1fSNishanth Aravamudan 	 * overcommit value, we are allowing the surplus pool size to
1444d1c3fb1fSNishanth Aravamudan 	 * exceed overcommit. There are few sane options here. Since
1445d1c3fb1fSNishanth Aravamudan 	 * alloc_buddy_huge_page() is checking the global counter,
1446d1c3fb1fSNishanth Aravamudan 	 * though, we'll note that we're not allowed to exceed surplus
1447d1c3fb1fSNishanth Aravamudan 	 * and won't grow the pool anywhere else. Not until one of the
1448d1c3fb1fSNishanth Aravamudan 	 * sysctls are changed, or the surplus pages go out of use.
14497893d1d5SAdam Litke 	 */
1450a5516438SAndi Kleen 	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
14516b0c880dSAdam Litke 	min_count = max(count, min_count);
14526ae11b27SLee Schermerhorn 	try_to_free_low(h, min_count, nodes_allowed);
1453a5516438SAndi Kleen 	while (min_count < persistent_huge_pages(h)) {
14546ae11b27SLee Schermerhorn 		if (!free_pool_huge_page(h, nodes_allowed, 0))
14551da177e4SLinus Torvalds 			break;
14561da177e4SLinus Torvalds 	}
1457a5516438SAndi Kleen 	while (count < persistent_huge_pages(h)) {
14586ae11b27SLee Schermerhorn 		if (!adjust_pool_surplus(h, nodes_allowed, 1))
14597893d1d5SAdam Litke 			break;
14607893d1d5SAdam Litke 	}
14617893d1d5SAdam Litke out:
1462a5516438SAndi Kleen 	ret = persistent_huge_pages(h);
14631da177e4SLinus Torvalds 	spin_unlock(&hugetlb_lock);
14647893d1d5SAdam Litke 	return ret;
14651da177e4SLinus Torvalds }
14661da177e4SLinus Torvalds 
1467a3437870SNishanth Aravamudan #define HSTATE_ATTR_RO(_name) \
1468a3437870SNishanth Aravamudan 	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1469a3437870SNishanth Aravamudan 
1470a3437870SNishanth Aravamudan #define HSTATE_ATTR(_name) \
1471a3437870SNishanth Aravamudan 	static struct kobj_attribute _name##_attr = \
1472a3437870SNishanth Aravamudan 		__ATTR(_name, 0644, _name##_show, _name##_store)
1473a3437870SNishanth Aravamudan 
1474a3437870SNishanth Aravamudan static struct kobject *hugepages_kobj;
1475a3437870SNishanth Aravamudan static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1476a3437870SNishanth Aravamudan 
14779a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
14789a305230SLee Schermerhorn 
14799a305230SLee Schermerhorn static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
1480a3437870SNishanth Aravamudan {
1481a3437870SNishanth Aravamudan 	int i;
14829a305230SLee Schermerhorn 
1483a3437870SNishanth Aravamudan 	for (i = 0; i < HUGE_MAX_HSTATE; i++)
14849a305230SLee Schermerhorn 		if (hstate_kobjs[i] == kobj) {
14859a305230SLee Schermerhorn 			if (nidp)
14869a305230SLee Schermerhorn 				*nidp = NUMA_NO_NODE;
1487a3437870SNishanth Aravamudan 			return &hstates[i];
14889a305230SLee Schermerhorn 		}
14899a305230SLee Schermerhorn 
14909a305230SLee Schermerhorn 	return kobj_to_node_hstate(kobj, nidp);
1491a3437870SNishanth Aravamudan }
1492a3437870SNishanth Aravamudan 
149306808b08SLee Schermerhorn static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1494a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
1495a3437870SNishanth Aravamudan {
14969a305230SLee Schermerhorn 	struct hstate *h;
14979a305230SLee Schermerhorn 	unsigned long nr_huge_pages;
14989a305230SLee Schermerhorn 	int nid;
14999a305230SLee Schermerhorn 
15009a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
15019a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
15029a305230SLee Schermerhorn 		nr_huge_pages = h->nr_huge_pages;
15039a305230SLee Schermerhorn 	else
15049a305230SLee Schermerhorn 		nr_huge_pages = h->nr_huge_pages_node[nid];
15059a305230SLee Schermerhorn 
15069a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", nr_huge_pages);
1507a3437870SNishanth Aravamudan }
1508adbe8726SEric B Munson 
150906808b08SLee Schermerhorn static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
151006808b08SLee Schermerhorn 			struct kobject *kobj, struct kobj_attribute *attr,
151106808b08SLee Schermerhorn 			const char *buf, size_t len)
1512a3437870SNishanth Aravamudan {
1513a3437870SNishanth Aravamudan 	int err;
15149a305230SLee Schermerhorn 	int nid;
151506808b08SLee Schermerhorn 	unsigned long count;
15169a305230SLee Schermerhorn 	struct hstate *h;
1517bad44b5bSDavid Rientjes 	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1518a3437870SNishanth Aravamudan 
15193dbb95f7SJingoo Han 	err = kstrtoul(buf, 10, &count);
152073ae31e5SEric B Munson 	if (err)
1521adbe8726SEric B Munson 		goto out;
1522a3437870SNishanth Aravamudan 
15239a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
1524adbe8726SEric B Munson 	if (h->order >= MAX_ORDER) {
1525adbe8726SEric B Munson 		err = -EINVAL;
1526adbe8726SEric B Munson 		goto out;
1527adbe8726SEric B Munson 	}
1528adbe8726SEric B Munson 
15299a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE) {
15309a305230SLee Schermerhorn 		/*
15319a305230SLee Schermerhorn 		 * global hstate attribute
15329a305230SLee Schermerhorn 		 */
15339a305230SLee Schermerhorn 		if (!(obey_mempolicy &&
15349a305230SLee Schermerhorn 				init_nodemask_of_mempolicy(nodes_allowed))) {
153506808b08SLee Schermerhorn 			NODEMASK_FREE(nodes_allowed);
15368cebfcd0SLai Jiangshan 			nodes_allowed = &node_states[N_MEMORY];
153706808b08SLee Schermerhorn 		}
15389a305230SLee Schermerhorn 	} else if (nodes_allowed) {
15399a305230SLee Schermerhorn 		/*
15409a305230SLee Schermerhorn 		 * per node hstate attribute: adjust count to global,
15419a305230SLee Schermerhorn 		 * but restrict alloc/free to the specified node.
15429a305230SLee Schermerhorn 		 */
15439a305230SLee Schermerhorn 		count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
15449a305230SLee Schermerhorn 		init_nodemask_of_node(nodes_allowed, nid);
15459a305230SLee Schermerhorn 	} else
15468cebfcd0SLai Jiangshan 		nodes_allowed = &node_states[N_MEMORY];
15479a305230SLee Schermerhorn 
154806808b08SLee Schermerhorn 	h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1549a3437870SNishanth Aravamudan 
15508cebfcd0SLai Jiangshan 	if (nodes_allowed != &node_states[N_MEMORY])
155106808b08SLee Schermerhorn 		NODEMASK_FREE(nodes_allowed);
155206808b08SLee Schermerhorn 
155306808b08SLee Schermerhorn 	return len;
1554adbe8726SEric B Munson out:
1555adbe8726SEric B Munson 	NODEMASK_FREE(nodes_allowed);
1556adbe8726SEric B Munson 	return err;
155706808b08SLee Schermerhorn }
155806808b08SLee Schermerhorn 
155906808b08SLee Schermerhorn static ssize_t nr_hugepages_show(struct kobject *kobj,
156006808b08SLee Schermerhorn 				       struct kobj_attribute *attr, char *buf)
156106808b08SLee Schermerhorn {
156206808b08SLee Schermerhorn 	return nr_hugepages_show_common(kobj, attr, buf);
156306808b08SLee Schermerhorn }
156406808b08SLee Schermerhorn 
156506808b08SLee Schermerhorn static ssize_t nr_hugepages_store(struct kobject *kobj,
156606808b08SLee Schermerhorn 	       struct kobj_attribute *attr, const char *buf, size_t len)
156706808b08SLee Schermerhorn {
156806808b08SLee Schermerhorn 	return nr_hugepages_store_common(false, kobj, attr, buf, len);
1569a3437870SNishanth Aravamudan }
1570a3437870SNishanth Aravamudan HSTATE_ATTR(nr_hugepages);
1571a3437870SNishanth Aravamudan 
157206808b08SLee Schermerhorn #ifdef CONFIG_NUMA
157306808b08SLee Schermerhorn 
157406808b08SLee Schermerhorn /*
157506808b08SLee Schermerhorn  * hstate attribute for optionally mempolicy-based constraint on persistent
157606808b08SLee Schermerhorn  * huge page alloc/free.
157706808b08SLee Schermerhorn  */
157806808b08SLee Schermerhorn static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
157906808b08SLee Schermerhorn 				       struct kobj_attribute *attr, char *buf)
158006808b08SLee Schermerhorn {
158106808b08SLee Schermerhorn 	return nr_hugepages_show_common(kobj, attr, buf);
158206808b08SLee Schermerhorn }
158306808b08SLee Schermerhorn 
158406808b08SLee Schermerhorn static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
158506808b08SLee Schermerhorn 	       struct kobj_attribute *attr, const char *buf, size_t len)
158606808b08SLee Schermerhorn {
158706808b08SLee Schermerhorn 	return nr_hugepages_store_common(true, kobj, attr, buf, len);
158806808b08SLee Schermerhorn }
158906808b08SLee Schermerhorn HSTATE_ATTR(nr_hugepages_mempolicy);
159006808b08SLee Schermerhorn #endif
159106808b08SLee Schermerhorn 
159206808b08SLee Schermerhorn 
1593a3437870SNishanth Aravamudan static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1594a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
1595a3437870SNishanth Aravamudan {
15969a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
1597a3437870SNishanth Aravamudan 	return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1598a3437870SNishanth Aravamudan }
1599adbe8726SEric B Munson 
1600a3437870SNishanth Aravamudan static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1601a3437870SNishanth Aravamudan 		struct kobj_attribute *attr, const char *buf, size_t count)
1602a3437870SNishanth Aravamudan {
1603a3437870SNishanth Aravamudan 	int err;
1604a3437870SNishanth Aravamudan 	unsigned long input;
16059a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
1606a3437870SNishanth Aravamudan 
1607adbe8726SEric B Munson 	if (h->order >= MAX_ORDER)
1608adbe8726SEric B Munson 		return -EINVAL;
1609adbe8726SEric B Munson 
16103dbb95f7SJingoo Han 	err = kstrtoul(buf, 10, &input);
1611a3437870SNishanth Aravamudan 	if (err)
161273ae31e5SEric B Munson 		return err;
1613a3437870SNishanth Aravamudan 
1614a3437870SNishanth Aravamudan 	spin_lock(&hugetlb_lock);
1615a3437870SNishanth Aravamudan 	h->nr_overcommit_huge_pages = input;
1616a3437870SNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
1617a3437870SNishanth Aravamudan 
1618a3437870SNishanth Aravamudan 	return count;
1619a3437870SNishanth Aravamudan }
1620a3437870SNishanth Aravamudan HSTATE_ATTR(nr_overcommit_hugepages);
1621a3437870SNishanth Aravamudan 
1622a3437870SNishanth Aravamudan static ssize_t free_hugepages_show(struct kobject *kobj,
1623a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
1624a3437870SNishanth Aravamudan {
16259a305230SLee Schermerhorn 	struct hstate *h;
16269a305230SLee Schermerhorn 	unsigned long free_huge_pages;
16279a305230SLee Schermerhorn 	int nid;
16289a305230SLee Schermerhorn 
16299a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
16309a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
16319a305230SLee Schermerhorn 		free_huge_pages = h->free_huge_pages;
16329a305230SLee Schermerhorn 	else
16339a305230SLee Schermerhorn 		free_huge_pages = h->free_huge_pages_node[nid];
16349a305230SLee Schermerhorn 
16359a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", free_huge_pages);
1636a3437870SNishanth Aravamudan }
1637a3437870SNishanth Aravamudan HSTATE_ATTR_RO(free_hugepages);
1638a3437870SNishanth Aravamudan 
1639a3437870SNishanth Aravamudan static ssize_t resv_hugepages_show(struct kobject *kobj,
1640a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
1641a3437870SNishanth Aravamudan {
16429a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
1643a3437870SNishanth Aravamudan 	return sprintf(buf, "%lu\n", h->resv_huge_pages);
1644a3437870SNishanth Aravamudan }
1645a3437870SNishanth Aravamudan HSTATE_ATTR_RO(resv_hugepages);
1646a3437870SNishanth Aravamudan 
1647a3437870SNishanth Aravamudan static ssize_t surplus_hugepages_show(struct kobject *kobj,
1648a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
1649a3437870SNishanth Aravamudan {
16509a305230SLee Schermerhorn 	struct hstate *h;
16519a305230SLee Schermerhorn 	unsigned long surplus_huge_pages;
16529a305230SLee Schermerhorn 	int nid;
16539a305230SLee Schermerhorn 
16549a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
16559a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
16569a305230SLee Schermerhorn 		surplus_huge_pages = h->surplus_huge_pages;
16579a305230SLee Schermerhorn 	else
16589a305230SLee Schermerhorn 		surplus_huge_pages = h->surplus_huge_pages_node[nid];
16599a305230SLee Schermerhorn 
16609a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", surplus_huge_pages);
1661a3437870SNishanth Aravamudan }
1662a3437870SNishanth Aravamudan HSTATE_ATTR_RO(surplus_hugepages);
1663a3437870SNishanth Aravamudan 
1664a3437870SNishanth Aravamudan static struct attribute *hstate_attrs[] = {
1665a3437870SNishanth Aravamudan 	&nr_hugepages_attr.attr,
1666a3437870SNishanth Aravamudan 	&nr_overcommit_hugepages_attr.attr,
1667a3437870SNishanth Aravamudan 	&free_hugepages_attr.attr,
1668a3437870SNishanth Aravamudan 	&resv_hugepages_attr.attr,
1669a3437870SNishanth Aravamudan 	&surplus_hugepages_attr.attr,
167006808b08SLee Schermerhorn #ifdef CONFIG_NUMA
167106808b08SLee Schermerhorn 	&nr_hugepages_mempolicy_attr.attr,
167206808b08SLee Schermerhorn #endif
1673a3437870SNishanth Aravamudan 	NULL,
1674a3437870SNishanth Aravamudan };
1675a3437870SNishanth Aravamudan 
1676a3437870SNishanth Aravamudan static struct attribute_group hstate_attr_group = {
1677a3437870SNishanth Aravamudan 	.attrs = hstate_attrs,
1678a3437870SNishanth Aravamudan };
1679a3437870SNishanth Aravamudan 
1680094e9539SJeff Mahoney static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
16819a305230SLee Schermerhorn 				    struct kobject **hstate_kobjs,
16829a305230SLee Schermerhorn 				    struct attribute_group *hstate_attr_group)
1683a3437870SNishanth Aravamudan {
1684a3437870SNishanth Aravamudan 	int retval;
1685972dc4deSAneesh Kumar K.V 	int hi = hstate_index(h);
1686a3437870SNishanth Aravamudan 
16879a305230SLee Schermerhorn 	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
16889a305230SLee Schermerhorn 	if (!hstate_kobjs[hi])
1689a3437870SNishanth Aravamudan 		return -ENOMEM;
1690a3437870SNishanth Aravamudan 
16919a305230SLee Schermerhorn 	retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
1692a3437870SNishanth Aravamudan 	if (retval)
16939a305230SLee Schermerhorn 		kobject_put(hstate_kobjs[hi]);
1694a3437870SNishanth Aravamudan 
1695a3437870SNishanth Aravamudan 	return retval;
1696a3437870SNishanth Aravamudan }
1697a3437870SNishanth Aravamudan 
1698a3437870SNishanth Aravamudan static void __init hugetlb_sysfs_init(void)
1699a3437870SNishanth Aravamudan {
1700a3437870SNishanth Aravamudan 	struct hstate *h;
1701a3437870SNishanth Aravamudan 	int err;
1702a3437870SNishanth Aravamudan 
1703a3437870SNishanth Aravamudan 	hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1704a3437870SNishanth Aravamudan 	if (!hugepages_kobj)
1705a3437870SNishanth Aravamudan 		return;
1706a3437870SNishanth Aravamudan 
1707a3437870SNishanth Aravamudan 	for_each_hstate(h) {
17089a305230SLee Schermerhorn 		err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
17099a305230SLee Schermerhorn 					 hstate_kobjs, &hstate_attr_group);
1710a3437870SNishanth Aravamudan 		if (err)
1711ffb22af5SAndrew Morton 			pr_err("Hugetlb: Unable to add hstate %s", h->name);
1712a3437870SNishanth Aravamudan 	}
1713a3437870SNishanth Aravamudan }
1714a3437870SNishanth Aravamudan 
17159a305230SLee Schermerhorn #ifdef CONFIG_NUMA
17169a305230SLee Schermerhorn 
17179a305230SLee Schermerhorn /*
17189a305230SLee Schermerhorn  * node_hstate/s - associate per node hstate attributes, via their kobjects,
171910fbcf4cSKay Sievers  * with node devices in node_devices[] using a parallel array.  The array
172010fbcf4cSKay Sievers  * index of a node device or _hstate == node id.
172110fbcf4cSKay Sievers  * This is here to avoid any static dependency of the node device driver, in
17229a305230SLee Schermerhorn  * the base kernel, on the hugetlb module.
17239a305230SLee Schermerhorn  */
17249a305230SLee Schermerhorn struct node_hstate {
17259a305230SLee Schermerhorn 	struct kobject		*hugepages_kobj;
17269a305230SLee Schermerhorn 	struct kobject		*hstate_kobjs[HUGE_MAX_HSTATE];
17279a305230SLee Schermerhorn };
17289a305230SLee Schermerhorn struct node_hstate node_hstates[MAX_NUMNODES];
17299a305230SLee Schermerhorn 
17309a305230SLee Schermerhorn /*
173110fbcf4cSKay Sievers  * A subset of global hstate attributes for node devices
17329a305230SLee Schermerhorn  */
17339a305230SLee Schermerhorn static struct attribute *per_node_hstate_attrs[] = {
17349a305230SLee Schermerhorn 	&nr_hugepages_attr.attr,
17359a305230SLee Schermerhorn 	&free_hugepages_attr.attr,
17369a305230SLee Schermerhorn 	&surplus_hugepages_attr.attr,
17379a305230SLee Schermerhorn 	NULL,
17389a305230SLee Schermerhorn };
17399a305230SLee Schermerhorn 
17409a305230SLee Schermerhorn static struct attribute_group per_node_hstate_attr_group = {
17419a305230SLee Schermerhorn 	.attrs = per_node_hstate_attrs,
17429a305230SLee Schermerhorn };
17439a305230SLee Schermerhorn 
17449a305230SLee Schermerhorn /*
174510fbcf4cSKay Sievers  * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
17469a305230SLee Schermerhorn  * Returns node id via non-NULL nidp.
17479a305230SLee Schermerhorn  */
17489a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
17499a305230SLee Schermerhorn {
17509a305230SLee Schermerhorn 	int nid;
17519a305230SLee Schermerhorn 
17529a305230SLee Schermerhorn 	for (nid = 0; nid < nr_node_ids; nid++) {
17539a305230SLee Schermerhorn 		struct node_hstate *nhs = &node_hstates[nid];
17549a305230SLee Schermerhorn 		int i;
17559a305230SLee Schermerhorn 		for (i = 0; i < HUGE_MAX_HSTATE; i++)
17569a305230SLee Schermerhorn 			if (nhs->hstate_kobjs[i] == kobj) {
17579a305230SLee Schermerhorn 				if (nidp)
17589a305230SLee Schermerhorn 					*nidp = nid;
17599a305230SLee Schermerhorn 				return &hstates[i];
17609a305230SLee Schermerhorn 			}
17619a305230SLee Schermerhorn 	}
17629a305230SLee Schermerhorn 
17639a305230SLee Schermerhorn 	BUG();
17649a305230SLee Schermerhorn 	return NULL;
17659a305230SLee Schermerhorn }
17669a305230SLee Schermerhorn 
17679a305230SLee Schermerhorn /*
176810fbcf4cSKay Sievers  * Unregister hstate attributes from a single node device.
17699a305230SLee Schermerhorn  * No-op if no hstate attributes attached.
17709a305230SLee Schermerhorn  */
17713cd8b44fSClaudiu Ghioc static void hugetlb_unregister_node(struct node *node)
17729a305230SLee Schermerhorn {
17739a305230SLee Schermerhorn 	struct hstate *h;
177410fbcf4cSKay Sievers 	struct node_hstate *nhs = &node_hstates[node->dev.id];
17759a305230SLee Schermerhorn 
17769a305230SLee Schermerhorn 	if (!nhs->hugepages_kobj)
17779b5e5d0fSLee Schermerhorn 		return;		/* no hstate attributes */
17789a305230SLee Schermerhorn 
1779972dc4deSAneesh Kumar K.V 	for_each_hstate(h) {
1780972dc4deSAneesh Kumar K.V 		int idx = hstate_index(h);
1781972dc4deSAneesh Kumar K.V 		if (nhs->hstate_kobjs[idx]) {
1782972dc4deSAneesh Kumar K.V 			kobject_put(nhs->hstate_kobjs[idx]);
1783972dc4deSAneesh Kumar K.V 			nhs->hstate_kobjs[idx] = NULL;
1784972dc4deSAneesh Kumar K.V 		}
17859a305230SLee Schermerhorn 	}
17869a305230SLee Schermerhorn 
17879a305230SLee Schermerhorn 	kobject_put(nhs->hugepages_kobj);
17889a305230SLee Schermerhorn 	nhs->hugepages_kobj = NULL;
17899a305230SLee Schermerhorn }
17909a305230SLee Schermerhorn 
17919a305230SLee Schermerhorn /*
179210fbcf4cSKay Sievers  * hugetlb module exit:  unregister hstate attributes from node devices
17939a305230SLee Schermerhorn  * that have them.
17949a305230SLee Schermerhorn  */
17959a305230SLee Schermerhorn static void hugetlb_unregister_all_nodes(void)
17969a305230SLee Schermerhorn {
17979a305230SLee Schermerhorn 	int nid;
17989a305230SLee Schermerhorn 
17999a305230SLee Schermerhorn 	/*
180010fbcf4cSKay Sievers 	 * disable node device registrations.
18019a305230SLee Schermerhorn 	 */
18029a305230SLee Schermerhorn 	register_hugetlbfs_with_node(NULL, NULL);
18039a305230SLee Schermerhorn 
18049a305230SLee Schermerhorn 	/*
18059a305230SLee Schermerhorn 	 * remove hstate attributes from any nodes that have them.
18069a305230SLee Schermerhorn 	 */
18079a305230SLee Schermerhorn 	for (nid = 0; nid < nr_node_ids; nid++)
18088732794bSWen Congyang 		hugetlb_unregister_node(node_devices[nid]);
18099a305230SLee Schermerhorn }
18109a305230SLee Schermerhorn 
18119a305230SLee Schermerhorn /*
181210fbcf4cSKay Sievers  * Register hstate attributes for a single node device.
18139a305230SLee Schermerhorn  * No-op if attributes already registered.
18149a305230SLee Schermerhorn  */
18153cd8b44fSClaudiu Ghioc static void hugetlb_register_node(struct node *node)
18169a305230SLee Schermerhorn {
18179a305230SLee Schermerhorn 	struct hstate *h;
181810fbcf4cSKay Sievers 	struct node_hstate *nhs = &node_hstates[node->dev.id];
18199a305230SLee Schermerhorn 	int err;
18209a305230SLee Schermerhorn 
18219a305230SLee Schermerhorn 	if (nhs->hugepages_kobj)
18229a305230SLee Schermerhorn 		return;		/* already allocated */
18239a305230SLee Schermerhorn 
18249a305230SLee Schermerhorn 	nhs->hugepages_kobj = kobject_create_and_add("hugepages",
182510fbcf4cSKay Sievers 							&node->dev.kobj);
18269a305230SLee Schermerhorn 	if (!nhs->hugepages_kobj)
18279a305230SLee Schermerhorn 		return;
18289a305230SLee Schermerhorn 
18299a305230SLee Schermerhorn 	for_each_hstate(h) {
18309a305230SLee Schermerhorn 		err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
18319a305230SLee Schermerhorn 						nhs->hstate_kobjs,
18329a305230SLee Schermerhorn 						&per_node_hstate_attr_group);
18339a305230SLee Schermerhorn 		if (err) {
1834ffb22af5SAndrew Morton 			pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
183510fbcf4cSKay Sievers 				h->name, node->dev.id);
18369a305230SLee Schermerhorn 			hugetlb_unregister_node(node);
18379a305230SLee Schermerhorn 			break;
18389a305230SLee Schermerhorn 		}
18399a305230SLee Schermerhorn 	}
18409a305230SLee Schermerhorn }
18419a305230SLee Schermerhorn 
18429a305230SLee Schermerhorn /*
18439b5e5d0fSLee Schermerhorn  * hugetlb init time:  register hstate attributes for all registered node
184410fbcf4cSKay Sievers  * devices of nodes that have memory.  All on-line nodes should have
184510fbcf4cSKay Sievers  * registered their associated device by this time.
18469a305230SLee Schermerhorn  */
18479a305230SLee Schermerhorn static void hugetlb_register_all_nodes(void)
18489a305230SLee Schermerhorn {
18499a305230SLee Schermerhorn 	int nid;
18509a305230SLee Schermerhorn 
18518cebfcd0SLai Jiangshan 	for_each_node_state(nid, N_MEMORY) {
18528732794bSWen Congyang 		struct node *node = node_devices[nid];
185310fbcf4cSKay Sievers 		if (node->dev.id == nid)
18549a305230SLee Schermerhorn 			hugetlb_register_node(node);
18559a305230SLee Schermerhorn 	}
18569a305230SLee Schermerhorn 
18579a305230SLee Schermerhorn 	/*
185810fbcf4cSKay Sievers 	 * Let the node device driver know we're here so it can
18599a305230SLee Schermerhorn 	 * [un]register hstate attributes on node hotplug.
18609a305230SLee Schermerhorn 	 */
18619a305230SLee Schermerhorn 	register_hugetlbfs_with_node(hugetlb_register_node,
18629a305230SLee Schermerhorn 				     hugetlb_unregister_node);
18639a305230SLee Schermerhorn }
18649a305230SLee Schermerhorn #else	/* !CONFIG_NUMA */
18659a305230SLee Schermerhorn 
18669a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
18679a305230SLee Schermerhorn {
18689a305230SLee Schermerhorn 	BUG();
18699a305230SLee Schermerhorn 	if (nidp)
18709a305230SLee Schermerhorn 		*nidp = -1;
18719a305230SLee Schermerhorn 	return NULL;
18729a305230SLee Schermerhorn }
18739a305230SLee Schermerhorn 
18749a305230SLee Schermerhorn static void hugetlb_unregister_all_nodes(void) { }
18759a305230SLee Schermerhorn 
18769a305230SLee Schermerhorn static void hugetlb_register_all_nodes(void) { }
18779a305230SLee Schermerhorn 
18789a305230SLee Schermerhorn #endif
18799a305230SLee Schermerhorn 
1880a3437870SNishanth Aravamudan static void __exit hugetlb_exit(void)
1881a3437870SNishanth Aravamudan {
1882a3437870SNishanth Aravamudan 	struct hstate *h;
1883a3437870SNishanth Aravamudan 
18849a305230SLee Schermerhorn 	hugetlb_unregister_all_nodes();
18859a305230SLee Schermerhorn 
1886a3437870SNishanth Aravamudan 	for_each_hstate(h) {
1887972dc4deSAneesh Kumar K.V 		kobject_put(hstate_kobjs[hstate_index(h)]);
1888a3437870SNishanth Aravamudan 	}
1889a3437870SNishanth Aravamudan 
1890a3437870SNishanth Aravamudan 	kobject_put(hugepages_kobj);
1891a3437870SNishanth Aravamudan }
1892a3437870SNishanth Aravamudan module_exit(hugetlb_exit);
1893a3437870SNishanth Aravamudan 
1894a3437870SNishanth Aravamudan static int __init hugetlb_init(void)
1895a3437870SNishanth Aravamudan {
18960ef89d25SBenjamin Herrenschmidt 	/* Some platform decide whether they support huge pages at boot
18970ef89d25SBenjamin Herrenschmidt 	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
18980ef89d25SBenjamin Herrenschmidt 	 * there is no such support
18990ef89d25SBenjamin Herrenschmidt 	 */
19000ef89d25SBenjamin Herrenschmidt 	if (HPAGE_SHIFT == 0)
19010ef89d25SBenjamin Herrenschmidt 		return 0;
1902a3437870SNishanth Aravamudan 
1903e11bfbfcSNick Piggin 	if (!size_to_hstate(default_hstate_size)) {
1904e11bfbfcSNick Piggin 		default_hstate_size = HPAGE_SIZE;
1905e11bfbfcSNick Piggin 		if (!size_to_hstate(default_hstate_size))
1906a3437870SNishanth Aravamudan 			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1907a3437870SNishanth Aravamudan 	}
1908972dc4deSAneesh Kumar K.V 	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
1909e11bfbfcSNick Piggin 	if (default_hstate_max_huge_pages)
1910e11bfbfcSNick Piggin 		default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1911a3437870SNishanth Aravamudan 
1912a3437870SNishanth Aravamudan 	hugetlb_init_hstates();
1913aa888a74SAndi Kleen 	gather_bootmem_prealloc();
1914a3437870SNishanth Aravamudan 	report_hugepages();
1915a3437870SNishanth Aravamudan 
1916a3437870SNishanth Aravamudan 	hugetlb_sysfs_init();
19179a305230SLee Schermerhorn 	hugetlb_register_all_nodes();
19187179e7bfSJianguo Wu 	hugetlb_cgroup_file_init();
19199a305230SLee Schermerhorn 
1920a3437870SNishanth Aravamudan 	return 0;
1921a3437870SNishanth Aravamudan }
1922a3437870SNishanth Aravamudan module_init(hugetlb_init);
1923a3437870SNishanth Aravamudan 
1924a3437870SNishanth Aravamudan /* Should be called on processing a hugepagesz=... option */
1925a3437870SNishanth Aravamudan void __init hugetlb_add_hstate(unsigned order)
1926a3437870SNishanth Aravamudan {
1927a3437870SNishanth Aravamudan 	struct hstate *h;
19288faa8b07SAndi Kleen 	unsigned long i;
19298faa8b07SAndi Kleen 
1930a3437870SNishanth Aravamudan 	if (size_to_hstate(PAGE_SIZE << order)) {
1931ffb22af5SAndrew Morton 		pr_warning("hugepagesz= specified twice, ignoring\n");
1932a3437870SNishanth Aravamudan 		return;
1933a3437870SNishanth Aravamudan 	}
193447d38344SAneesh Kumar K.V 	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
1935a3437870SNishanth Aravamudan 	BUG_ON(order == 0);
193647d38344SAneesh Kumar K.V 	h = &hstates[hugetlb_max_hstate++];
1937a3437870SNishanth Aravamudan 	h->order = order;
1938a3437870SNishanth Aravamudan 	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
19398faa8b07SAndi Kleen 	h->nr_huge_pages = 0;
19408faa8b07SAndi Kleen 	h->free_huge_pages = 0;
19418faa8b07SAndi Kleen 	for (i = 0; i < MAX_NUMNODES; ++i)
19428faa8b07SAndi Kleen 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
19430edaecfaSAneesh Kumar K.V 	INIT_LIST_HEAD(&h->hugepage_activelist);
19448cebfcd0SLai Jiangshan 	h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
19458cebfcd0SLai Jiangshan 	h->next_nid_to_free = first_node(node_states[N_MEMORY]);
1946a3437870SNishanth Aravamudan 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1947a3437870SNishanth Aravamudan 					huge_page_size(h)/1024);
19488faa8b07SAndi Kleen 
1949a3437870SNishanth Aravamudan 	parsed_hstate = h;
1950a3437870SNishanth Aravamudan }
1951a3437870SNishanth Aravamudan 
1952e11bfbfcSNick Piggin static int __init hugetlb_nrpages_setup(char *s)
1953a3437870SNishanth Aravamudan {
1954a3437870SNishanth Aravamudan 	unsigned long *mhp;
19558faa8b07SAndi Kleen 	static unsigned long *last_mhp;
1956a3437870SNishanth Aravamudan 
1957a3437870SNishanth Aravamudan 	/*
195847d38344SAneesh Kumar K.V 	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
1959a3437870SNishanth Aravamudan 	 * so this hugepages= parameter goes to the "default hstate".
1960a3437870SNishanth Aravamudan 	 */
196147d38344SAneesh Kumar K.V 	if (!hugetlb_max_hstate)
1962a3437870SNishanth Aravamudan 		mhp = &default_hstate_max_huge_pages;
1963a3437870SNishanth Aravamudan 	else
1964a3437870SNishanth Aravamudan 		mhp = &parsed_hstate->max_huge_pages;
1965a3437870SNishanth Aravamudan 
19668faa8b07SAndi Kleen 	if (mhp == last_mhp) {
1967ffb22af5SAndrew Morton 		pr_warning("hugepages= specified twice without "
19688faa8b07SAndi Kleen 			   "interleaving hugepagesz=, ignoring\n");
19698faa8b07SAndi Kleen 		return 1;
19708faa8b07SAndi Kleen 	}
19718faa8b07SAndi Kleen 
1972a3437870SNishanth Aravamudan 	if (sscanf(s, "%lu", mhp) <= 0)
1973a3437870SNishanth Aravamudan 		*mhp = 0;
1974a3437870SNishanth Aravamudan 
19758faa8b07SAndi Kleen 	/*
19768faa8b07SAndi Kleen 	 * Global state is always initialized later in hugetlb_init.
19778faa8b07SAndi Kleen 	 * But we need to allocate >= MAX_ORDER hstates here early to still
19788faa8b07SAndi Kleen 	 * use the bootmem allocator.
19798faa8b07SAndi Kleen 	 */
198047d38344SAneesh Kumar K.V 	if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
19818faa8b07SAndi Kleen 		hugetlb_hstate_alloc_pages(parsed_hstate);
19828faa8b07SAndi Kleen 
19838faa8b07SAndi Kleen 	last_mhp = mhp;
19848faa8b07SAndi Kleen 
1985a3437870SNishanth Aravamudan 	return 1;
1986a3437870SNishanth Aravamudan }
1987e11bfbfcSNick Piggin __setup("hugepages=", hugetlb_nrpages_setup);
1988e11bfbfcSNick Piggin 
1989e11bfbfcSNick Piggin static int __init hugetlb_default_setup(char *s)
1990e11bfbfcSNick Piggin {
1991e11bfbfcSNick Piggin 	default_hstate_size = memparse(s, &s);
1992e11bfbfcSNick Piggin 	return 1;
1993e11bfbfcSNick Piggin }
1994e11bfbfcSNick Piggin __setup("default_hugepagesz=", hugetlb_default_setup);
1995a3437870SNishanth Aravamudan 
19968a213460SNishanth Aravamudan static unsigned int cpuset_mems_nr(unsigned int *array)
19978a213460SNishanth Aravamudan {
19988a213460SNishanth Aravamudan 	int node;
19998a213460SNishanth Aravamudan 	unsigned int nr = 0;
20008a213460SNishanth Aravamudan 
20018a213460SNishanth Aravamudan 	for_each_node_mask(node, cpuset_current_mems_allowed)
20028a213460SNishanth Aravamudan 		nr += array[node];
20038a213460SNishanth Aravamudan 
20048a213460SNishanth Aravamudan 	return nr;
20058a213460SNishanth Aravamudan }
20068a213460SNishanth Aravamudan 
20078a213460SNishanth Aravamudan #ifdef CONFIG_SYSCTL
200806808b08SLee Schermerhorn static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
200906808b08SLee Schermerhorn 			 struct ctl_table *table, int write,
201006808b08SLee Schermerhorn 			 void __user *buffer, size_t *length, loff_t *ppos)
20111da177e4SLinus Torvalds {
2012e5ff2159SAndi Kleen 	struct hstate *h = &default_hstate;
2013e5ff2159SAndi Kleen 	unsigned long tmp;
201408d4a246SMichal Hocko 	int ret;
2015e5ff2159SAndi Kleen 
2016e5ff2159SAndi Kleen 	tmp = h->max_huge_pages;
2017e5ff2159SAndi Kleen 
2018adbe8726SEric B Munson 	if (write && h->order >= MAX_ORDER)
2019adbe8726SEric B Munson 		return -EINVAL;
2020adbe8726SEric B Munson 
2021e5ff2159SAndi Kleen 	table->data = &tmp;
2022e5ff2159SAndi Kleen 	table->maxlen = sizeof(unsigned long);
202308d4a246SMichal Hocko 	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
202408d4a246SMichal Hocko 	if (ret)
202508d4a246SMichal Hocko 		goto out;
2026e5ff2159SAndi Kleen 
202706808b08SLee Schermerhorn 	if (write) {
2028bad44b5bSDavid Rientjes 		NODEMASK_ALLOC(nodemask_t, nodes_allowed,
2029bad44b5bSDavid Rientjes 						GFP_KERNEL | __GFP_NORETRY);
203006808b08SLee Schermerhorn 		if (!(obey_mempolicy &&
203106808b08SLee Schermerhorn 			       init_nodemask_of_mempolicy(nodes_allowed))) {
203206808b08SLee Schermerhorn 			NODEMASK_FREE(nodes_allowed);
20338cebfcd0SLai Jiangshan 			nodes_allowed = &node_states[N_MEMORY];
203406808b08SLee Schermerhorn 		}
203506808b08SLee Schermerhorn 		h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
203606808b08SLee Schermerhorn 
20378cebfcd0SLai Jiangshan 		if (nodes_allowed != &node_states[N_MEMORY])
203806808b08SLee Schermerhorn 			NODEMASK_FREE(nodes_allowed);
203906808b08SLee Schermerhorn 	}
204008d4a246SMichal Hocko out:
204108d4a246SMichal Hocko 	return ret;
20421da177e4SLinus Torvalds }
2043396faf03SMel Gorman 
204406808b08SLee Schermerhorn int hugetlb_sysctl_handler(struct ctl_table *table, int write,
204506808b08SLee Schermerhorn 			  void __user *buffer, size_t *length, loff_t *ppos)
204606808b08SLee Schermerhorn {
204706808b08SLee Schermerhorn 
204806808b08SLee Schermerhorn 	return hugetlb_sysctl_handler_common(false, table, write,
204906808b08SLee Schermerhorn 							buffer, length, ppos);
205006808b08SLee Schermerhorn }
205106808b08SLee Schermerhorn 
205206808b08SLee Schermerhorn #ifdef CONFIG_NUMA
205306808b08SLee Schermerhorn int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
205406808b08SLee Schermerhorn 			  void __user *buffer, size_t *length, loff_t *ppos)
205506808b08SLee Schermerhorn {
205606808b08SLee Schermerhorn 	return hugetlb_sysctl_handler_common(true, table, write,
205706808b08SLee Schermerhorn 							buffer, length, ppos);
205806808b08SLee Schermerhorn }
205906808b08SLee Schermerhorn #endif /* CONFIG_NUMA */
206006808b08SLee Schermerhorn 
2061396faf03SMel Gorman int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
20628d65af78SAlexey Dobriyan 			void __user *buffer,
2063396faf03SMel Gorman 			size_t *length, loff_t *ppos)
2064396faf03SMel Gorman {
20658d65af78SAlexey Dobriyan 	proc_dointvec(table, write, buffer, length, ppos);
2066396faf03SMel Gorman 	if (hugepages_treat_as_movable)
2067396faf03SMel Gorman 		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
2068396faf03SMel Gorman 	else
2069396faf03SMel Gorman 		htlb_alloc_mask = GFP_HIGHUSER;
2070396faf03SMel Gorman 	return 0;
2071396faf03SMel Gorman }
2072396faf03SMel Gorman 
2073a3d0c6aaSNishanth Aravamudan int hugetlb_overcommit_handler(struct ctl_table *table, int write,
20748d65af78SAlexey Dobriyan 			void __user *buffer,
2075a3d0c6aaSNishanth Aravamudan 			size_t *length, loff_t *ppos)
2076a3d0c6aaSNishanth Aravamudan {
2077a5516438SAndi Kleen 	struct hstate *h = &default_hstate;
2078e5ff2159SAndi Kleen 	unsigned long tmp;
207908d4a246SMichal Hocko 	int ret;
2080e5ff2159SAndi Kleen 
2081e5ff2159SAndi Kleen 	tmp = h->nr_overcommit_huge_pages;
2082e5ff2159SAndi Kleen 
2083adbe8726SEric B Munson 	if (write && h->order >= MAX_ORDER)
2084adbe8726SEric B Munson 		return -EINVAL;
2085adbe8726SEric B Munson 
2086e5ff2159SAndi Kleen 	table->data = &tmp;
2087e5ff2159SAndi Kleen 	table->maxlen = sizeof(unsigned long);
208808d4a246SMichal Hocko 	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
208908d4a246SMichal Hocko 	if (ret)
209008d4a246SMichal Hocko 		goto out;
2091e5ff2159SAndi Kleen 
2092e5ff2159SAndi Kleen 	if (write) {
2093064d9efeSNishanth Aravamudan 		spin_lock(&hugetlb_lock);
2094e5ff2159SAndi Kleen 		h->nr_overcommit_huge_pages = tmp;
2095a3d0c6aaSNishanth Aravamudan 		spin_unlock(&hugetlb_lock);
2096e5ff2159SAndi Kleen 	}
209708d4a246SMichal Hocko out:
209808d4a246SMichal Hocko 	return ret;
2099a3d0c6aaSNishanth Aravamudan }
2100a3d0c6aaSNishanth Aravamudan 
21011da177e4SLinus Torvalds #endif /* CONFIG_SYSCTL */
21021da177e4SLinus Torvalds 
2103e1759c21SAlexey Dobriyan void hugetlb_report_meminfo(struct seq_file *m)
21041da177e4SLinus Torvalds {
2105a5516438SAndi Kleen 	struct hstate *h = &default_hstate;
2106e1759c21SAlexey Dobriyan 	seq_printf(m,
21071da177e4SLinus Torvalds 			"HugePages_Total:   %5lu\n"
21081da177e4SLinus Torvalds 			"HugePages_Free:    %5lu\n"
2109b45b5bd6SDavid Gibson 			"HugePages_Rsvd:    %5lu\n"
21107893d1d5SAdam Litke 			"HugePages_Surp:    %5lu\n"
21114f98a2feSRik van Riel 			"Hugepagesize:   %8lu kB\n",
2112a5516438SAndi Kleen 			h->nr_huge_pages,
2113a5516438SAndi Kleen 			h->free_huge_pages,
2114a5516438SAndi Kleen 			h->resv_huge_pages,
2115a5516438SAndi Kleen 			h->surplus_huge_pages,
2116a5516438SAndi Kleen 			1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
21171da177e4SLinus Torvalds }
21181da177e4SLinus Torvalds 
21191da177e4SLinus Torvalds int hugetlb_report_node_meminfo(int nid, char *buf)
21201da177e4SLinus Torvalds {
2121a5516438SAndi Kleen 	struct hstate *h = &default_hstate;
21221da177e4SLinus Torvalds 	return sprintf(buf,
21231da177e4SLinus Torvalds 		"Node %d HugePages_Total: %5u\n"
2124a1de0919SNishanth Aravamudan 		"Node %d HugePages_Free:  %5u\n"
2125a1de0919SNishanth Aravamudan 		"Node %d HugePages_Surp:  %5u\n",
2126a5516438SAndi Kleen 		nid, h->nr_huge_pages_node[nid],
2127a5516438SAndi Kleen 		nid, h->free_huge_pages_node[nid],
2128a5516438SAndi Kleen 		nid, h->surplus_huge_pages_node[nid]);
21291da177e4SLinus Torvalds }
21301da177e4SLinus Torvalds 
2131949f7ec5SDavid Rientjes void hugetlb_show_meminfo(void)
2132949f7ec5SDavid Rientjes {
2133949f7ec5SDavid Rientjes 	struct hstate *h;
2134949f7ec5SDavid Rientjes 	int nid;
2135949f7ec5SDavid Rientjes 
2136949f7ec5SDavid Rientjes 	for_each_node_state(nid, N_MEMORY)
2137949f7ec5SDavid Rientjes 		for_each_hstate(h)
2138949f7ec5SDavid Rientjes 			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
2139949f7ec5SDavid Rientjes 				nid,
2140949f7ec5SDavid Rientjes 				h->nr_huge_pages_node[nid],
2141949f7ec5SDavid Rientjes 				h->free_huge_pages_node[nid],
2142949f7ec5SDavid Rientjes 				h->surplus_huge_pages_node[nid],
2143949f7ec5SDavid Rientjes 				1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
2144949f7ec5SDavid Rientjes }
2145949f7ec5SDavid Rientjes 
21461da177e4SLinus Torvalds /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
21471da177e4SLinus Torvalds unsigned long hugetlb_total_pages(void)
21481da177e4SLinus Torvalds {
2149d0028588SWanpeng Li 	struct hstate *h;
2150d0028588SWanpeng Li 	unsigned long nr_total_pages = 0;
2151d0028588SWanpeng Li 
2152d0028588SWanpeng Li 	for_each_hstate(h)
2153d0028588SWanpeng Li 		nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
2154d0028588SWanpeng Li 	return nr_total_pages;
21551da177e4SLinus Torvalds }
21561da177e4SLinus Torvalds 
2157a5516438SAndi Kleen static int hugetlb_acct_memory(struct hstate *h, long delta)
2158fc1b8a73SMel Gorman {
2159fc1b8a73SMel Gorman 	int ret = -ENOMEM;
2160fc1b8a73SMel Gorman 
2161fc1b8a73SMel Gorman 	spin_lock(&hugetlb_lock);
2162fc1b8a73SMel Gorman 	/*
2163fc1b8a73SMel Gorman 	 * When cpuset is configured, it breaks the strict hugetlb page
2164fc1b8a73SMel Gorman 	 * reservation as the accounting is done on a global variable. Such
2165fc1b8a73SMel Gorman 	 * reservation is completely rubbish in the presence of cpuset because
2166fc1b8a73SMel Gorman 	 * the reservation is not checked against page availability for the
2167fc1b8a73SMel Gorman 	 * current cpuset. Application can still potentially OOM'ed by kernel
2168fc1b8a73SMel Gorman 	 * with lack of free htlb page in cpuset that the task is in.
2169fc1b8a73SMel Gorman 	 * Attempt to enforce strict accounting with cpuset is almost
2170fc1b8a73SMel Gorman 	 * impossible (or too ugly) because cpuset is too fluid that
2171fc1b8a73SMel Gorman 	 * task or memory node can be dynamically moved between cpusets.
2172fc1b8a73SMel Gorman 	 *
2173fc1b8a73SMel Gorman 	 * The change of semantics for shared hugetlb mapping with cpuset is
2174fc1b8a73SMel Gorman 	 * undesirable. However, in order to preserve some of the semantics,
2175fc1b8a73SMel Gorman 	 * we fall back to check against current free page availability as
2176fc1b8a73SMel Gorman 	 * a best attempt and hopefully to minimize the impact of changing
2177fc1b8a73SMel Gorman 	 * semantics that cpuset has.
2178fc1b8a73SMel Gorman 	 */
2179fc1b8a73SMel Gorman 	if (delta > 0) {
2180a5516438SAndi Kleen 		if (gather_surplus_pages(h, delta) < 0)
2181fc1b8a73SMel Gorman 			goto out;
2182fc1b8a73SMel Gorman 
2183a5516438SAndi Kleen 		if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
2184a5516438SAndi Kleen 			return_unused_surplus_pages(h, delta);
2185fc1b8a73SMel Gorman 			goto out;
2186fc1b8a73SMel Gorman 		}
2187fc1b8a73SMel Gorman 	}
2188fc1b8a73SMel Gorman 
2189fc1b8a73SMel Gorman 	ret = 0;
2190fc1b8a73SMel Gorman 	if (delta < 0)
2191a5516438SAndi Kleen 		return_unused_surplus_pages(h, (unsigned long) -delta);
2192fc1b8a73SMel Gorman 
2193fc1b8a73SMel Gorman out:
2194fc1b8a73SMel Gorman 	spin_unlock(&hugetlb_lock);
2195fc1b8a73SMel Gorman 	return ret;
2196fc1b8a73SMel Gorman }
2197fc1b8a73SMel Gorman 
219884afd99bSAndy Whitcroft static void hugetlb_vm_op_open(struct vm_area_struct *vma)
219984afd99bSAndy Whitcroft {
2200f522c3acSJoonsoo Kim 	struct resv_map *resv = vma_resv_map(vma);
220184afd99bSAndy Whitcroft 
220284afd99bSAndy Whitcroft 	/*
220384afd99bSAndy Whitcroft 	 * This new VMA should share its siblings reservation map if present.
220484afd99bSAndy Whitcroft 	 * The VMA will only ever have a valid reservation map pointer where
220584afd99bSAndy Whitcroft 	 * it is being copied for another still existing VMA.  As that VMA
220625985edcSLucas De Marchi 	 * has a reference to the reservation map it cannot disappear until
220784afd99bSAndy Whitcroft 	 * after this open call completes.  It is therefore safe to take a
220884afd99bSAndy Whitcroft 	 * new reference here without additional locking.
220984afd99bSAndy Whitcroft 	 */
2210f522c3acSJoonsoo Kim 	if (resv)
2211f522c3acSJoonsoo Kim 		kref_get(&resv->refs);
221284afd99bSAndy Whitcroft }
221384afd99bSAndy Whitcroft 
2214c50ac050SDave Hansen static void resv_map_put(struct vm_area_struct *vma)
2215c50ac050SDave Hansen {
2216f522c3acSJoonsoo Kim 	struct resv_map *resv = vma_resv_map(vma);
2217c50ac050SDave Hansen 
2218f522c3acSJoonsoo Kim 	if (!resv)
2219c50ac050SDave Hansen 		return;
2220f522c3acSJoonsoo Kim 	kref_put(&resv->refs, resv_map_release);
2221c50ac050SDave Hansen }
2222c50ac050SDave Hansen 
2223a1e78772SMel Gorman static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2224a1e78772SMel Gorman {
2225a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
2226f522c3acSJoonsoo Kim 	struct resv_map *resv = vma_resv_map(vma);
222790481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_vma(vma);
222884afd99bSAndy Whitcroft 	unsigned long reserve;
222984afd99bSAndy Whitcroft 	unsigned long start;
223084afd99bSAndy Whitcroft 	unsigned long end;
223184afd99bSAndy Whitcroft 
2232f522c3acSJoonsoo Kim 	if (resv) {
2233a5516438SAndi Kleen 		start = vma_hugecache_offset(h, vma, vma->vm_start);
2234a5516438SAndi Kleen 		end = vma_hugecache_offset(h, vma, vma->vm_end);
223584afd99bSAndy Whitcroft 
223684afd99bSAndy Whitcroft 		reserve = (end - start) -
2237f522c3acSJoonsoo Kim 			region_count(&resv->regions, start, end);
223884afd99bSAndy Whitcroft 
2239c50ac050SDave Hansen 		resv_map_put(vma);
224084afd99bSAndy Whitcroft 
22417251ff78SAdam Litke 		if (reserve) {
2242a5516438SAndi Kleen 			hugetlb_acct_memory(h, -reserve);
224390481622SDavid Gibson 			hugepage_subpool_put_pages(spool, reserve);
22447251ff78SAdam Litke 		}
2245a1e78772SMel Gorman 	}
224684afd99bSAndy Whitcroft }
2247a1e78772SMel Gorman 
22481da177e4SLinus Torvalds /*
22491da177e4SLinus Torvalds  * We cannot handle pagefaults against hugetlb pages at all.  They cause
22501da177e4SLinus Torvalds  * handle_mm_fault() to try to instantiate regular-sized pages in the
22511da177e4SLinus Torvalds  * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
22521da177e4SLinus Torvalds  * this far.
22531da177e4SLinus Torvalds  */
2254d0217ac0SNick Piggin static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
22551da177e4SLinus Torvalds {
22561da177e4SLinus Torvalds 	BUG();
2257d0217ac0SNick Piggin 	return 0;
22581da177e4SLinus Torvalds }
22591da177e4SLinus Torvalds 
2260f0f37e2fSAlexey Dobriyan const struct vm_operations_struct hugetlb_vm_ops = {
2261d0217ac0SNick Piggin 	.fault = hugetlb_vm_op_fault,
226284afd99bSAndy Whitcroft 	.open = hugetlb_vm_op_open,
2263a1e78772SMel Gorman 	.close = hugetlb_vm_op_close,
22641da177e4SLinus Torvalds };
22651da177e4SLinus Torvalds 
22661e8f889bSDavid Gibson static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
22671e8f889bSDavid Gibson 				int writable)
226863551ae0SDavid Gibson {
226963551ae0SDavid Gibson 	pte_t entry;
227063551ae0SDavid Gibson 
22711e8f889bSDavid Gibson 	if (writable) {
2272106c992aSGerald Schaefer 		entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
2273106c992aSGerald Schaefer 					 vma->vm_page_prot)));
227463551ae0SDavid Gibson 	} else {
2275106c992aSGerald Schaefer 		entry = huge_pte_wrprotect(mk_huge_pte(page,
2276106c992aSGerald Schaefer 					   vma->vm_page_prot));
227763551ae0SDavid Gibson 	}
227863551ae0SDavid Gibson 	entry = pte_mkyoung(entry);
227963551ae0SDavid Gibson 	entry = pte_mkhuge(entry);
2280d9ed9faaSChris Metcalf 	entry = arch_make_huge_pte(entry, vma, page, writable);
228163551ae0SDavid Gibson 
228263551ae0SDavid Gibson 	return entry;
228363551ae0SDavid Gibson }
228463551ae0SDavid Gibson 
22851e8f889bSDavid Gibson static void set_huge_ptep_writable(struct vm_area_struct *vma,
22861e8f889bSDavid Gibson 				   unsigned long address, pte_t *ptep)
22871e8f889bSDavid Gibson {
22881e8f889bSDavid Gibson 	pte_t entry;
22891e8f889bSDavid Gibson 
2290106c992aSGerald Schaefer 	entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
229132f84528SChris Forbes 	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
22924b3073e1SRussell King 		update_mmu_cache(vma, address, ptep);
22931e8f889bSDavid Gibson }
22941e8f889bSDavid Gibson 
22951e8f889bSDavid Gibson 
229663551ae0SDavid Gibson int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
229763551ae0SDavid Gibson 			    struct vm_area_struct *vma)
229863551ae0SDavid Gibson {
229963551ae0SDavid Gibson 	pte_t *src_pte, *dst_pte, entry;
230063551ae0SDavid Gibson 	struct page *ptepage;
23011c59827dSHugh Dickins 	unsigned long addr;
23021e8f889bSDavid Gibson 	int cow;
2303a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
2304a5516438SAndi Kleen 	unsigned long sz = huge_page_size(h);
23051e8f889bSDavid Gibson 
23061e8f889bSDavid Gibson 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
230763551ae0SDavid Gibson 
2308a5516438SAndi Kleen 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
2309c74df32cSHugh Dickins 		src_pte = huge_pte_offset(src, addr);
2310c74df32cSHugh Dickins 		if (!src_pte)
2311c74df32cSHugh Dickins 			continue;
2312a5516438SAndi Kleen 		dst_pte = huge_pte_alloc(dst, addr, sz);
231363551ae0SDavid Gibson 		if (!dst_pte)
231463551ae0SDavid Gibson 			goto nomem;
2315c5c99429SLarry Woodman 
2316c5c99429SLarry Woodman 		/* If the pagetables are shared don't copy or take references */
2317c5c99429SLarry Woodman 		if (dst_pte == src_pte)
2318c5c99429SLarry Woodman 			continue;
2319c5c99429SLarry Woodman 
2320c74df32cSHugh Dickins 		spin_lock(&dst->page_table_lock);
232146478758SNick Piggin 		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
23227f2e9525SGerald Schaefer 		if (!huge_pte_none(huge_ptep_get(src_pte))) {
23231e8f889bSDavid Gibson 			if (cow)
23247f2e9525SGerald Schaefer 				huge_ptep_set_wrprotect(src, addr, src_pte);
23257f2e9525SGerald Schaefer 			entry = huge_ptep_get(src_pte);
232663551ae0SDavid Gibson 			ptepage = pte_page(entry);
232763551ae0SDavid Gibson 			get_page(ptepage);
23280fe6e20bSNaoya Horiguchi 			page_dup_rmap(ptepage);
232963551ae0SDavid Gibson 			set_huge_pte_at(dst, addr, dst_pte, entry);
23301c59827dSHugh Dickins 		}
23311c59827dSHugh Dickins 		spin_unlock(&src->page_table_lock);
2332c74df32cSHugh Dickins 		spin_unlock(&dst->page_table_lock);
233363551ae0SDavid Gibson 	}
233463551ae0SDavid Gibson 	return 0;
233563551ae0SDavid Gibson 
233663551ae0SDavid Gibson nomem:
233763551ae0SDavid Gibson 	return -ENOMEM;
233863551ae0SDavid Gibson }
233963551ae0SDavid Gibson 
2340290408d4SNaoya Horiguchi static int is_hugetlb_entry_migration(pte_t pte)
2341290408d4SNaoya Horiguchi {
2342290408d4SNaoya Horiguchi 	swp_entry_t swp;
2343290408d4SNaoya Horiguchi 
2344290408d4SNaoya Horiguchi 	if (huge_pte_none(pte) || pte_present(pte))
2345290408d4SNaoya Horiguchi 		return 0;
2346290408d4SNaoya Horiguchi 	swp = pte_to_swp_entry(pte);
234732f84528SChris Forbes 	if (non_swap_entry(swp) && is_migration_entry(swp))
2348290408d4SNaoya Horiguchi 		return 1;
234932f84528SChris Forbes 	else
2350290408d4SNaoya Horiguchi 		return 0;
2351290408d4SNaoya Horiguchi }
2352290408d4SNaoya Horiguchi 
2353fd6a03edSNaoya Horiguchi static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2354fd6a03edSNaoya Horiguchi {
2355fd6a03edSNaoya Horiguchi 	swp_entry_t swp;
2356fd6a03edSNaoya Horiguchi 
2357fd6a03edSNaoya Horiguchi 	if (huge_pte_none(pte) || pte_present(pte))
2358fd6a03edSNaoya Horiguchi 		return 0;
2359fd6a03edSNaoya Horiguchi 	swp = pte_to_swp_entry(pte);
236032f84528SChris Forbes 	if (non_swap_entry(swp) && is_hwpoison_entry(swp))
2361fd6a03edSNaoya Horiguchi 		return 1;
236232f84528SChris Forbes 	else
2363fd6a03edSNaoya Horiguchi 		return 0;
2364fd6a03edSNaoya Horiguchi }
2365fd6a03edSNaoya Horiguchi 
236624669e58SAneesh Kumar K.V void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
236724669e58SAneesh Kumar K.V 			    unsigned long start, unsigned long end,
236824669e58SAneesh Kumar K.V 			    struct page *ref_page)
236963551ae0SDavid Gibson {
237024669e58SAneesh Kumar K.V 	int force_flush = 0;
237163551ae0SDavid Gibson 	struct mm_struct *mm = vma->vm_mm;
237263551ae0SDavid Gibson 	unsigned long address;
2373c7546f8fSDavid Gibson 	pte_t *ptep;
237463551ae0SDavid Gibson 	pte_t pte;
237563551ae0SDavid Gibson 	struct page *page;
2376a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
2377a5516438SAndi Kleen 	unsigned long sz = huge_page_size(h);
23782ec74c3eSSagi Grimberg 	const unsigned long mmun_start = start;	/* For mmu_notifiers */
23792ec74c3eSSagi Grimberg 	const unsigned long mmun_end   = end;	/* For mmu_notifiers */
2380a5516438SAndi Kleen 
238163551ae0SDavid Gibson 	WARN_ON(!is_vm_hugetlb_page(vma));
2382a5516438SAndi Kleen 	BUG_ON(start & ~huge_page_mask(h));
2383a5516438SAndi Kleen 	BUG_ON(end & ~huge_page_mask(h));
238463551ae0SDavid Gibson 
238524669e58SAneesh Kumar K.V 	tlb_start_vma(tlb, vma);
23862ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
238724669e58SAneesh Kumar K.V again:
2388508034a3SHugh Dickins 	spin_lock(&mm->page_table_lock);
2389a5516438SAndi Kleen 	for (address = start; address < end; address += sz) {
2390c7546f8fSDavid Gibson 		ptep = huge_pte_offset(mm, address);
2391c7546f8fSDavid Gibson 		if (!ptep)
2392c7546f8fSDavid Gibson 			continue;
2393c7546f8fSDavid Gibson 
239439dde65cSChen, Kenneth W 		if (huge_pmd_unshare(mm, &address, ptep))
239539dde65cSChen, Kenneth W 			continue;
239639dde65cSChen, Kenneth W 
23976629326bSHillf Danton 		pte = huge_ptep_get(ptep);
23986629326bSHillf Danton 		if (huge_pte_none(pte))
23996629326bSHillf Danton 			continue;
24006629326bSHillf Danton 
24016629326bSHillf Danton 		/*
24026629326bSHillf Danton 		 * HWPoisoned hugepage is already unmapped and dropped reference
24036629326bSHillf Danton 		 */
24048c4894c6SNaoya Horiguchi 		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
2405106c992aSGerald Schaefer 			huge_pte_clear(mm, address, ptep);
24066629326bSHillf Danton 			continue;
24078c4894c6SNaoya Horiguchi 		}
24086629326bSHillf Danton 
24096629326bSHillf Danton 		page = pte_page(pte);
241004f2cbe3SMel Gorman 		/*
241104f2cbe3SMel Gorman 		 * If a reference page is supplied, it is because a specific
241204f2cbe3SMel Gorman 		 * page is being unmapped, not a range. Ensure the page we
241304f2cbe3SMel Gorman 		 * are about to unmap is the actual page of interest.
241404f2cbe3SMel Gorman 		 */
241504f2cbe3SMel Gorman 		if (ref_page) {
241604f2cbe3SMel Gorman 			if (page != ref_page)
241704f2cbe3SMel Gorman 				continue;
241804f2cbe3SMel Gorman 
241904f2cbe3SMel Gorman 			/*
242004f2cbe3SMel Gorman 			 * Mark the VMA as having unmapped its page so that
242104f2cbe3SMel Gorman 			 * future faults in this VMA will fail rather than
242204f2cbe3SMel Gorman 			 * looking like data was lost
242304f2cbe3SMel Gorman 			 */
242404f2cbe3SMel Gorman 			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
242504f2cbe3SMel Gorman 		}
242604f2cbe3SMel Gorman 
2427c7546f8fSDavid Gibson 		pte = huge_ptep_get_and_clear(mm, address, ptep);
242824669e58SAneesh Kumar K.V 		tlb_remove_tlb_entry(tlb, ptep, address);
2429106c992aSGerald Schaefer 		if (huge_pte_dirty(pte))
24306649a386SKen Chen 			set_page_dirty(page);
24319e81130bSHillf Danton 
243224669e58SAneesh Kumar K.V 		page_remove_rmap(page);
243324669e58SAneesh Kumar K.V 		force_flush = !__tlb_remove_page(tlb, page);
243424669e58SAneesh Kumar K.V 		if (force_flush)
243524669e58SAneesh Kumar K.V 			break;
24369e81130bSHillf Danton 		/* Bail out after unmapping reference page if supplied */
24379e81130bSHillf Danton 		if (ref_page)
24389e81130bSHillf Danton 			break;
243963551ae0SDavid Gibson 	}
2440cd2934a3SAl Viro 	spin_unlock(&mm->page_table_lock);
244124669e58SAneesh Kumar K.V 	/*
244224669e58SAneesh Kumar K.V 	 * mmu_gather ran out of room to batch pages, we break out of
244324669e58SAneesh Kumar K.V 	 * the PTE lock to avoid doing the potential expensive TLB invalidate
244424669e58SAneesh Kumar K.V 	 * and page-free while holding it.
244524669e58SAneesh Kumar K.V 	 */
244624669e58SAneesh Kumar K.V 	if (force_flush) {
244724669e58SAneesh Kumar K.V 		force_flush = 0;
244824669e58SAneesh Kumar K.V 		tlb_flush_mmu(tlb);
244924669e58SAneesh Kumar K.V 		if (address < end && !ref_page)
245024669e58SAneesh Kumar K.V 			goto again;
2451fe1668aeSChen, Kenneth W 	}
24522ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
245324669e58SAneesh Kumar K.V 	tlb_end_vma(tlb, vma);
24541da177e4SLinus Torvalds }
245563551ae0SDavid Gibson 
2456d833352aSMel Gorman void __unmap_hugepage_range_final(struct mmu_gather *tlb,
2457d833352aSMel Gorman 			  struct vm_area_struct *vma, unsigned long start,
2458d833352aSMel Gorman 			  unsigned long end, struct page *ref_page)
2459d833352aSMel Gorman {
2460d833352aSMel Gorman 	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
2461d833352aSMel Gorman 
2462d833352aSMel Gorman 	/*
2463d833352aSMel Gorman 	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
2464d833352aSMel Gorman 	 * test will fail on a vma being torn down, and not grab a page table
2465d833352aSMel Gorman 	 * on its way out.  We're lucky that the flag has such an appropriate
2466d833352aSMel Gorman 	 * name, and can in fact be safely cleared here. We could clear it
2467d833352aSMel Gorman 	 * before the __unmap_hugepage_range above, but all that's necessary
2468d833352aSMel Gorman 	 * is to clear it before releasing the i_mmap_mutex. This works
2469d833352aSMel Gorman 	 * because in the context this is called, the VMA is about to be
2470d833352aSMel Gorman 	 * destroyed and the i_mmap_mutex is held.
2471d833352aSMel Gorman 	 */
2472d833352aSMel Gorman 	vma->vm_flags &= ~VM_MAYSHARE;
2473d833352aSMel Gorman }
2474d833352aSMel Gorman 
2475502717f4SChen, Kenneth W void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
247604f2cbe3SMel Gorman 			  unsigned long end, struct page *ref_page)
2477502717f4SChen, Kenneth W {
247824669e58SAneesh Kumar K.V 	struct mm_struct *mm;
247924669e58SAneesh Kumar K.V 	struct mmu_gather tlb;
248024669e58SAneesh Kumar K.V 
248124669e58SAneesh Kumar K.V 	mm = vma->vm_mm;
248224669e58SAneesh Kumar K.V 
24832b047252SLinus Torvalds 	tlb_gather_mmu(&tlb, mm, start, end);
248424669e58SAneesh Kumar K.V 	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
248524669e58SAneesh Kumar K.V 	tlb_finish_mmu(&tlb, start, end);
2486502717f4SChen, Kenneth W }
2487502717f4SChen, Kenneth W 
248804f2cbe3SMel Gorman /*
248904f2cbe3SMel Gorman  * This is called when the original mapper is failing to COW a MAP_PRIVATE
249004f2cbe3SMel Gorman  * mappping it owns the reserve page for. The intention is to unmap the page
249104f2cbe3SMel Gorman  * from other VMAs and let the children be SIGKILLed if they are faulting the
249204f2cbe3SMel Gorman  * same region.
249304f2cbe3SMel Gorman  */
24942a4b3dedSHarvey Harrison static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
24952a4b3dedSHarvey Harrison 				struct page *page, unsigned long address)
249604f2cbe3SMel Gorman {
24977526674dSAdam Litke 	struct hstate *h = hstate_vma(vma);
249804f2cbe3SMel Gorman 	struct vm_area_struct *iter_vma;
249904f2cbe3SMel Gorman 	struct address_space *mapping;
250004f2cbe3SMel Gorman 	pgoff_t pgoff;
250104f2cbe3SMel Gorman 
250204f2cbe3SMel Gorman 	/*
250304f2cbe3SMel Gorman 	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
250404f2cbe3SMel Gorman 	 * from page cache lookup which is in HPAGE_SIZE units.
250504f2cbe3SMel Gorman 	 */
25067526674dSAdam Litke 	address = address & huge_page_mask(h);
250736e4f20aSMichal Hocko 	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
250836e4f20aSMichal Hocko 			vma->vm_pgoff;
2509496ad9aaSAl Viro 	mapping = file_inode(vma->vm_file)->i_mapping;
251004f2cbe3SMel Gorman 
25114eb2b1dcSMel Gorman 	/*
25124eb2b1dcSMel Gorman 	 * Take the mapping lock for the duration of the table walk. As
25134eb2b1dcSMel Gorman 	 * this mapping should be shared between all the VMAs,
25144eb2b1dcSMel Gorman 	 * __unmap_hugepage_range() is called as the lock is already held
25154eb2b1dcSMel Gorman 	 */
25163d48ae45SPeter Zijlstra 	mutex_lock(&mapping->i_mmap_mutex);
25176b2dbba8SMichel Lespinasse 	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
251804f2cbe3SMel Gorman 		/* Do not unmap the current VMA */
251904f2cbe3SMel Gorman 		if (iter_vma == vma)
252004f2cbe3SMel Gorman 			continue;
252104f2cbe3SMel Gorman 
252204f2cbe3SMel Gorman 		/*
252304f2cbe3SMel Gorman 		 * Unmap the page from other VMAs without their own reserves.
252404f2cbe3SMel Gorman 		 * They get marked to be SIGKILLed if they fault in these
252504f2cbe3SMel Gorman 		 * areas. This is because a future no-page fault on this VMA
252604f2cbe3SMel Gorman 		 * could insert a zeroed page instead of the data existing
252704f2cbe3SMel Gorman 		 * from the time of fork. This would look like data corruption
252804f2cbe3SMel Gorman 		 */
252904f2cbe3SMel Gorman 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
253024669e58SAneesh Kumar K.V 			unmap_hugepage_range(iter_vma, address,
253124669e58SAneesh Kumar K.V 					     address + huge_page_size(h), page);
253204f2cbe3SMel Gorman 	}
25333d48ae45SPeter Zijlstra 	mutex_unlock(&mapping->i_mmap_mutex);
253404f2cbe3SMel Gorman 
253504f2cbe3SMel Gorman 	return 1;
253604f2cbe3SMel Gorman }
253704f2cbe3SMel Gorman 
25380fe6e20bSNaoya Horiguchi /*
25390fe6e20bSNaoya Horiguchi  * Hugetlb_cow() should be called with page lock of the original hugepage held.
2540ef009b25SMichal Hocko  * Called with hugetlb_instantiation_mutex held and pte_page locked so we
2541ef009b25SMichal Hocko  * cannot race with other handlers or page migration.
2542ef009b25SMichal Hocko  * Keep the pte_same checks anyway to make transition from the mutex easier.
25430fe6e20bSNaoya Horiguchi  */
25441e8f889bSDavid Gibson static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
254504f2cbe3SMel Gorman 			unsigned long address, pte_t *ptep, pte_t pte,
254604f2cbe3SMel Gorman 			struct page *pagecache_page)
25471e8f889bSDavid Gibson {
2548a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
25491e8f889bSDavid Gibson 	struct page *old_page, *new_page;
255004f2cbe3SMel Gorman 	int outside_reserve = 0;
25512ec74c3eSSagi Grimberg 	unsigned long mmun_start;	/* For mmu_notifiers */
25522ec74c3eSSagi Grimberg 	unsigned long mmun_end;		/* For mmu_notifiers */
25531e8f889bSDavid Gibson 
25541e8f889bSDavid Gibson 	old_page = pte_page(pte);
25551e8f889bSDavid Gibson 
255604f2cbe3SMel Gorman retry_avoidcopy:
25571e8f889bSDavid Gibson 	/* If no-one else is actually using this page, avoid the copy
25581e8f889bSDavid Gibson 	 * and just make the page writable */
255937a2140dSJoonsoo Kim 	if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
25600fe6e20bSNaoya Horiguchi 		page_move_anon_rmap(old_page, vma, address);
25611e8f889bSDavid Gibson 		set_huge_ptep_writable(vma, address, ptep);
256283c54070SNick Piggin 		return 0;
25631e8f889bSDavid Gibson 	}
25641e8f889bSDavid Gibson 
256504f2cbe3SMel Gorman 	/*
256604f2cbe3SMel Gorman 	 * If the process that created a MAP_PRIVATE mapping is about to
256704f2cbe3SMel Gorman 	 * perform a COW due to a shared page count, attempt to satisfy
256804f2cbe3SMel Gorman 	 * the allocation without using the existing reserves. The pagecache
256904f2cbe3SMel Gorman 	 * page is used to determine if the reserve at this address was
257004f2cbe3SMel Gorman 	 * consumed or not. If reserves were used, a partial faulted mapping
257104f2cbe3SMel Gorman 	 * at the time of fork() could consume its reserves on COW instead
257204f2cbe3SMel Gorman 	 * of the full address range.
257304f2cbe3SMel Gorman 	 */
25745944d011SJoonsoo Kim 	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
257504f2cbe3SMel Gorman 			old_page != pagecache_page)
257604f2cbe3SMel Gorman 		outside_reserve = 1;
257704f2cbe3SMel Gorman 
25781e8f889bSDavid Gibson 	page_cache_get(old_page);
2579b76c8cfbSLarry Woodman 
2580b76c8cfbSLarry Woodman 	/* Drop page_table_lock as buddy allocator may be called */
2581b76c8cfbSLarry Woodman 	spin_unlock(&mm->page_table_lock);
258204f2cbe3SMel Gorman 	new_page = alloc_huge_page(vma, address, outside_reserve);
25831e8f889bSDavid Gibson 
25842fc39cecSAdam Litke 	if (IS_ERR(new_page)) {
258576dcee75SAneesh Kumar K.V 		long err = PTR_ERR(new_page);
25861e8f889bSDavid Gibson 		page_cache_release(old_page);
258704f2cbe3SMel Gorman 
258804f2cbe3SMel Gorman 		/*
258904f2cbe3SMel Gorman 		 * If a process owning a MAP_PRIVATE mapping fails to COW,
259004f2cbe3SMel Gorman 		 * it is due to references held by a child and an insufficient
259104f2cbe3SMel Gorman 		 * huge page pool. To guarantee the original mappers
259204f2cbe3SMel Gorman 		 * reliability, unmap the page from child processes. The child
259304f2cbe3SMel Gorman 		 * may get SIGKILLed if it later faults.
259404f2cbe3SMel Gorman 		 */
259504f2cbe3SMel Gorman 		if (outside_reserve) {
259604f2cbe3SMel Gorman 			BUG_ON(huge_pte_none(pte));
259704f2cbe3SMel Gorman 			if (unmap_ref_private(mm, vma, old_page, address)) {
259804f2cbe3SMel Gorman 				BUG_ON(huge_pte_none(pte));
2599b76c8cfbSLarry Woodman 				spin_lock(&mm->page_table_lock);
2600a734bcc8SHillf Danton 				ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2601a734bcc8SHillf Danton 				if (likely(pte_same(huge_ptep_get(ptep), pte)))
260204f2cbe3SMel Gorman 					goto retry_avoidcopy;
2603a734bcc8SHillf Danton 				/*
2604a734bcc8SHillf Danton 				 * race occurs while re-acquiring page_table_lock, and
2605a734bcc8SHillf Danton 				 * our job is done.
2606a734bcc8SHillf Danton 				 */
2607a734bcc8SHillf Danton 				return 0;
260804f2cbe3SMel Gorman 			}
260904f2cbe3SMel Gorman 			WARN_ON_ONCE(1);
261004f2cbe3SMel Gorman 		}
261104f2cbe3SMel Gorman 
2612b76c8cfbSLarry Woodman 		/* Caller expects lock to be held */
2613b76c8cfbSLarry Woodman 		spin_lock(&mm->page_table_lock);
261476dcee75SAneesh Kumar K.V 		if (err == -ENOMEM)
261576dcee75SAneesh Kumar K.V 			return VM_FAULT_OOM;
261676dcee75SAneesh Kumar K.V 		else
261776dcee75SAneesh Kumar K.V 			return VM_FAULT_SIGBUS;
26181e8f889bSDavid Gibson 	}
26191e8f889bSDavid Gibson 
26200fe6e20bSNaoya Horiguchi 	/*
26210fe6e20bSNaoya Horiguchi 	 * When the original hugepage is shared one, it does not have
26220fe6e20bSNaoya Horiguchi 	 * anon_vma prepared.
26230fe6e20bSNaoya Horiguchi 	 */
262444e2aa93SDean Nelson 	if (unlikely(anon_vma_prepare(vma))) {
2625ea4039a3SHillf Danton 		page_cache_release(new_page);
2626ea4039a3SHillf Danton 		page_cache_release(old_page);
262744e2aa93SDean Nelson 		/* Caller expects lock to be held */
262844e2aa93SDean Nelson 		spin_lock(&mm->page_table_lock);
26290fe6e20bSNaoya Horiguchi 		return VM_FAULT_OOM;
263044e2aa93SDean Nelson 	}
26310fe6e20bSNaoya Horiguchi 
263247ad8475SAndrea Arcangeli 	copy_user_huge_page(new_page, old_page, address, vma,
263347ad8475SAndrea Arcangeli 			    pages_per_huge_page(h));
26340ed361deSNick Piggin 	__SetPageUptodate(new_page);
26351e8f889bSDavid Gibson 
26362ec74c3eSSagi Grimberg 	mmun_start = address & huge_page_mask(h);
26372ec74c3eSSagi Grimberg 	mmun_end = mmun_start + huge_page_size(h);
26382ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2639b76c8cfbSLarry Woodman 	/*
2640b76c8cfbSLarry Woodman 	 * Retake the page_table_lock to check for racing updates
2641b76c8cfbSLarry Woodman 	 * before the page tables are altered
2642b76c8cfbSLarry Woodman 	 */
2643b76c8cfbSLarry Woodman 	spin_lock(&mm->page_table_lock);
2644a5516438SAndi Kleen 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
26457f2e9525SGerald Schaefer 	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
264607443a85SJoonsoo Kim 		ClearPagePrivate(new_page);
264707443a85SJoonsoo Kim 
26481e8f889bSDavid Gibson 		/* Break COW */
26498fe627ecSGerald Schaefer 		huge_ptep_clear_flush(vma, address, ptep);
26501e8f889bSDavid Gibson 		set_huge_pte_at(mm, address, ptep,
26511e8f889bSDavid Gibson 				make_huge_pte(vma, new_page, 1));
26520fe6e20bSNaoya Horiguchi 		page_remove_rmap(old_page);
2653cd67f0d2SNaoya Horiguchi 		hugepage_add_new_anon_rmap(new_page, vma, address);
26541e8f889bSDavid Gibson 		/* Make the old page be freed below */
26551e8f889bSDavid Gibson 		new_page = old_page;
26561e8f889bSDavid Gibson 	}
26572ec74c3eSSagi Grimberg 	spin_unlock(&mm->page_table_lock);
26582ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
26591e8f889bSDavid Gibson 	page_cache_release(new_page);
26601e8f889bSDavid Gibson 	page_cache_release(old_page);
26618312034fSJoonsoo Kim 
26628312034fSJoonsoo Kim 	/* Caller expects lock to be held */
26638312034fSJoonsoo Kim 	spin_lock(&mm->page_table_lock);
266483c54070SNick Piggin 	return 0;
26651e8f889bSDavid Gibson }
26661e8f889bSDavid Gibson 
266704f2cbe3SMel Gorman /* Return the pagecache page at a given address within a VMA */
2668a5516438SAndi Kleen static struct page *hugetlbfs_pagecache_page(struct hstate *h,
2669a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long address)
267004f2cbe3SMel Gorman {
267104f2cbe3SMel Gorman 	struct address_space *mapping;
2672e7c4b0bfSAndy Whitcroft 	pgoff_t idx;
267304f2cbe3SMel Gorman 
267404f2cbe3SMel Gorman 	mapping = vma->vm_file->f_mapping;
2675a5516438SAndi Kleen 	idx = vma_hugecache_offset(h, vma, address);
267604f2cbe3SMel Gorman 
267704f2cbe3SMel Gorman 	return find_lock_page(mapping, idx);
267804f2cbe3SMel Gorman }
267904f2cbe3SMel Gorman 
26803ae77f43SHugh Dickins /*
26813ae77f43SHugh Dickins  * Return whether there is a pagecache page to back given address within VMA.
26823ae77f43SHugh Dickins  * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
26833ae77f43SHugh Dickins  */
26843ae77f43SHugh Dickins static bool hugetlbfs_pagecache_present(struct hstate *h,
26852a15efc9SHugh Dickins 			struct vm_area_struct *vma, unsigned long address)
26862a15efc9SHugh Dickins {
26872a15efc9SHugh Dickins 	struct address_space *mapping;
26882a15efc9SHugh Dickins 	pgoff_t idx;
26892a15efc9SHugh Dickins 	struct page *page;
26902a15efc9SHugh Dickins 
26912a15efc9SHugh Dickins 	mapping = vma->vm_file->f_mapping;
26922a15efc9SHugh Dickins 	idx = vma_hugecache_offset(h, vma, address);
26932a15efc9SHugh Dickins 
26942a15efc9SHugh Dickins 	page = find_get_page(mapping, idx);
26952a15efc9SHugh Dickins 	if (page)
26962a15efc9SHugh Dickins 		put_page(page);
26972a15efc9SHugh Dickins 	return page != NULL;
26982a15efc9SHugh Dickins }
26992a15efc9SHugh Dickins 
2700a1ed3ddaSRobert P. J. Day static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2701788c7df4SHugh Dickins 			unsigned long address, pte_t *ptep, unsigned int flags)
2702ac9b9c66SHugh Dickins {
2703a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
2704ac9b9c66SHugh Dickins 	int ret = VM_FAULT_SIGBUS;
2705409eb8c2SHillf Danton 	int anon_rmap = 0;
2706e7c4b0bfSAndy Whitcroft 	pgoff_t idx;
27074c887265SAdam Litke 	unsigned long size;
27084c887265SAdam Litke 	struct page *page;
27094c887265SAdam Litke 	struct address_space *mapping;
27101e8f889bSDavid Gibson 	pte_t new_pte;
27114c887265SAdam Litke 
271204f2cbe3SMel Gorman 	/*
271304f2cbe3SMel Gorman 	 * Currently, we are forced to kill the process in the event the
271404f2cbe3SMel Gorman 	 * original mapper has unmapped pages from the child due to a failed
271525985edcSLucas De Marchi 	 * COW. Warn that such a situation has occurred as it may not be obvious
271604f2cbe3SMel Gorman 	 */
271704f2cbe3SMel Gorman 	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
2718ffb22af5SAndrew Morton 		pr_warning("PID %d killed due to inadequate hugepage pool\n",
271904f2cbe3SMel Gorman 			   current->pid);
272004f2cbe3SMel Gorman 		return ret;
272104f2cbe3SMel Gorman 	}
272204f2cbe3SMel Gorman 
27234c887265SAdam Litke 	mapping = vma->vm_file->f_mapping;
2724a5516438SAndi Kleen 	idx = vma_hugecache_offset(h, vma, address);
27254c887265SAdam Litke 
27264c887265SAdam Litke 	/*
27274c887265SAdam Litke 	 * Use page lock to guard against racing truncation
27284c887265SAdam Litke 	 * before we get page_table_lock.
27294c887265SAdam Litke 	 */
27306bda666aSChristoph Lameter retry:
27316bda666aSChristoph Lameter 	page = find_lock_page(mapping, idx);
27326bda666aSChristoph Lameter 	if (!page) {
2733a5516438SAndi Kleen 		size = i_size_read(mapping->host) >> huge_page_shift(h);
2734ebed4bfcSHugh Dickins 		if (idx >= size)
2735ebed4bfcSHugh Dickins 			goto out;
273604f2cbe3SMel Gorman 		page = alloc_huge_page(vma, address, 0);
27372fc39cecSAdam Litke 		if (IS_ERR(page)) {
273876dcee75SAneesh Kumar K.V 			ret = PTR_ERR(page);
273976dcee75SAneesh Kumar K.V 			if (ret == -ENOMEM)
274076dcee75SAneesh Kumar K.V 				ret = VM_FAULT_OOM;
274176dcee75SAneesh Kumar K.V 			else
274276dcee75SAneesh Kumar K.V 				ret = VM_FAULT_SIGBUS;
27436bda666aSChristoph Lameter 			goto out;
27446bda666aSChristoph Lameter 		}
274547ad8475SAndrea Arcangeli 		clear_huge_page(page, address, pages_per_huge_page(h));
27460ed361deSNick Piggin 		__SetPageUptodate(page);
2747ac9b9c66SHugh Dickins 
2748f83a275dSMel Gorman 		if (vma->vm_flags & VM_MAYSHARE) {
27496bda666aSChristoph Lameter 			int err;
275045c682a6SKen Chen 			struct inode *inode = mapping->host;
27516bda666aSChristoph Lameter 
27526bda666aSChristoph Lameter 			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
27536bda666aSChristoph Lameter 			if (err) {
27546bda666aSChristoph Lameter 				put_page(page);
27556bda666aSChristoph Lameter 				if (err == -EEXIST)
27566bda666aSChristoph Lameter 					goto retry;
27576bda666aSChristoph Lameter 				goto out;
27586bda666aSChristoph Lameter 			}
275907443a85SJoonsoo Kim 			ClearPagePrivate(page);
276045c682a6SKen Chen 
276145c682a6SKen Chen 			spin_lock(&inode->i_lock);
2762a5516438SAndi Kleen 			inode->i_blocks += blocks_per_huge_page(h);
276345c682a6SKen Chen 			spin_unlock(&inode->i_lock);
276423be7468SMel Gorman 		} else {
27656bda666aSChristoph Lameter 			lock_page(page);
27660fe6e20bSNaoya Horiguchi 			if (unlikely(anon_vma_prepare(vma))) {
27670fe6e20bSNaoya Horiguchi 				ret = VM_FAULT_OOM;
27680fe6e20bSNaoya Horiguchi 				goto backout_unlocked;
276923be7468SMel Gorman 			}
2770409eb8c2SHillf Danton 			anon_rmap = 1;
27710fe6e20bSNaoya Horiguchi 		}
27720fe6e20bSNaoya Horiguchi 	} else {
277357303d80SAndy Whitcroft 		/*
2774998b4382SNaoya Horiguchi 		 * If memory error occurs between mmap() and fault, some process
2775998b4382SNaoya Horiguchi 		 * don't have hwpoisoned swap entry for errored virtual address.
2776998b4382SNaoya Horiguchi 		 * So we need to block hugepage fault by PG_hwpoison bit check.
2777fd6a03edSNaoya Horiguchi 		 */
2778fd6a03edSNaoya Horiguchi 		if (unlikely(PageHWPoison(page))) {
2779aa50d3a7SAndi Kleen 			ret = VM_FAULT_HWPOISON |
2780972dc4deSAneesh Kumar K.V 				VM_FAULT_SET_HINDEX(hstate_index(h));
2781fd6a03edSNaoya Horiguchi 			goto backout_unlocked;
27826bda666aSChristoph Lameter 		}
2783998b4382SNaoya Horiguchi 	}
27841e8f889bSDavid Gibson 
278557303d80SAndy Whitcroft 	/*
278657303d80SAndy Whitcroft 	 * If we are going to COW a private mapping later, we examine the
278757303d80SAndy Whitcroft 	 * pending reservations for this page now. This will ensure that
278857303d80SAndy Whitcroft 	 * any allocations necessary to record that reservation occur outside
278957303d80SAndy Whitcroft 	 * the spinlock.
279057303d80SAndy Whitcroft 	 */
2791788c7df4SHugh Dickins 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
27922b26736cSAndy Whitcroft 		if (vma_needs_reservation(h, vma, address) < 0) {
27932b26736cSAndy Whitcroft 			ret = VM_FAULT_OOM;
27942b26736cSAndy Whitcroft 			goto backout_unlocked;
27952b26736cSAndy Whitcroft 		}
279657303d80SAndy Whitcroft 
2797ac9b9c66SHugh Dickins 	spin_lock(&mm->page_table_lock);
2798a5516438SAndi Kleen 	size = i_size_read(mapping->host) >> huge_page_shift(h);
27994c887265SAdam Litke 	if (idx >= size)
28004c887265SAdam Litke 		goto backout;
28014c887265SAdam Litke 
280283c54070SNick Piggin 	ret = 0;
28037f2e9525SGerald Schaefer 	if (!huge_pte_none(huge_ptep_get(ptep)))
28044c887265SAdam Litke 		goto backout;
28054c887265SAdam Litke 
280607443a85SJoonsoo Kim 	if (anon_rmap) {
280707443a85SJoonsoo Kim 		ClearPagePrivate(page);
2808409eb8c2SHillf Danton 		hugepage_add_new_anon_rmap(page, vma, address);
280907443a85SJoonsoo Kim 	}
2810409eb8c2SHillf Danton 	else
2811409eb8c2SHillf Danton 		page_dup_rmap(page);
28121e8f889bSDavid Gibson 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
28131e8f889bSDavid Gibson 				&& (vma->vm_flags & VM_SHARED)));
28141e8f889bSDavid Gibson 	set_huge_pte_at(mm, address, ptep, new_pte);
28151e8f889bSDavid Gibson 
2816788c7df4SHugh Dickins 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
28171e8f889bSDavid Gibson 		/* Optimization, do the COW without a second fault */
281804f2cbe3SMel Gorman 		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
28191e8f889bSDavid Gibson 	}
28201e8f889bSDavid Gibson 
2821ac9b9c66SHugh Dickins 	spin_unlock(&mm->page_table_lock);
28224c887265SAdam Litke 	unlock_page(page);
28234c887265SAdam Litke out:
2824ac9b9c66SHugh Dickins 	return ret;
28254c887265SAdam Litke 
28264c887265SAdam Litke backout:
28274c887265SAdam Litke 	spin_unlock(&mm->page_table_lock);
28282b26736cSAndy Whitcroft backout_unlocked:
28294c887265SAdam Litke 	unlock_page(page);
28304c887265SAdam Litke 	put_page(page);
28314c887265SAdam Litke 	goto out;
2832ac9b9c66SHugh Dickins }
2833ac9b9c66SHugh Dickins 
283486e5216fSAdam Litke int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2835788c7df4SHugh Dickins 			unsigned long address, unsigned int flags)
283686e5216fSAdam Litke {
283786e5216fSAdam Litke 	pte_t *ptep;
283886e5216fSAdam Litke 	pte_t entry;
28391e8f889bSDavid Gibson 	int ret;
28400fe6e20bSNaoya Horiguchi 	struct page *page = NULL;
284157303d80SAndy Whitcroft 	struct page *pagecache_page = NULL;
28423935baa9SDavid Gibson 	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
2843a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
284486e5216fSAdam Litke 
28451e16a539SKAMEZAWA Hiroyuki 	address &= huge_page_mask(h);
28461e16a539SKAMEZAWA Hiroyuki 
2847fd6a03edSNaoya Horiguchi 	ptep = huge_pte_offset(mm, address);
2848fd6a03edSNaoya Horiguchi 	if (ptep) {
2849fd6a03edSNaoya Horiguchi 		entry = huge_ptep_get(ptep);
2850290408d4SNaoya Horiguchi 		if (unlikely(is_hugetlb_entry_migration(entry))) {
285130dad309SNaoya Horiguchi 			migration_entry_wait_huge(mm, ptep);
2852290408d4SNaoya Horiguchi 			return 0;
2853290408d4SNaoya Horiguchi 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2854aa50d3a7SAndi Kleen 			return VM_FAULT_HWPOISON_LARGE |
2855972dc4deSAneesh Kumar K.V 				VM_FAULT_SET_HINDEX(hstate_index(h));
2856fd6a03edSNaoya Horiguchi 	}
2857fd6a03edSNaoya Horiguchi 
2858a5516438SAndi Kleen 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
285986e5216fSAdam Litke 	if (!ptep)
286086e5216fSAdam Litke 		return VM_FAULT_OOM;
286186e5216fSAdam Litke 
28623935baa9SDavid Gibson 	/*
28633935baa9SDavid Gibson 	 * Serialize hugepage allocation and instantiation, so that we don't
28643935baa9SDavid Gibson 	 * get spurious allocation failures if two CPUs race to instantiate
28653935baa9SDavid Gibson 	 * the same page in the page cache.
28663935baa9SDavid Gibson 	 */
28673935baa9SDavid Gibson 	mutex_lock(&hugetlb_instantiation_mutex);
28687f2e9525SGerald Schaefer 	entry = huge_ptep_get(ptep);
28697f2e9525SGerald Schaefer 	if (huge_pte_none(entry)) {
2870788c7df4SHugh Dickins 		ret = hugetlb_no_page(mm, vma, address, ptep, flags);
2871b4d1d99fSDavid Gibson 		goto out_mutex;
28723935baa9SDavid Gibson 	}
287386e5216fSAdam Litke 
287483c54070SNick Piggin 	ret = 0;
28751e8f889bSDavid Gibson 
287657303d80SAndy Whitcroft 	/*
287757303d80SAndy Whitcroft 	 * If we are going to COW the mapping later, we examine the pending
287857303d80SAndy Whitcroft 	 * reservations for this page now. This will ensure that any
287957303d80SAndy Whitcroft 	 * allocations necessary to record that reservation occur outside the
288057303d80SAndy Whitcroft 	 * spinlock. For private mappings, we also lookup the pagecache
288157303d80SAndy Whitcroft 	 * page now as it is used to determine if a reservation has been
288257303d80SAndy Whitcroft 	 * consumed.
288357303d80SAndy Whitcroft 	 */
2884106c992aSGerald Schaefer 	if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
28852b26736cSAndy Whitcroft 		if (vma_needs_reservation(h, vma, address) < 0) {
28862b26736cSAndy Whitcroft 			ret = VM_FAULT_OOM;
2887b4d1d99fSDavid Gibson 			goto out_mutex;
28882b26736cSAndy Whitcroft 		}
288957303d80SAndy Whitcroft 
2890f83a275dSMel Gorman 		if (!(vma->vm_flags & VM_MAYSHARE))
289157303d80SAndy Whitcroft 			pagecache_page = hugetlbfs_pagecache_page(h,
289257303d80SAndy Whitcroft 								vma, address);
289357303d80SAndy Whitcroft 	}
289457303d80SAndy Whitcroft 
289556c9cfb1SNaoya Horiguchi 	/*
289656c9cfb1SNaoya Horiguchi 	 * hugetlb_cow() requires page locks of pte_page(entry) and
289756c9cfb1SNaoya Horiguchi 	 * pagecache_page, so here we need take the former one
289856c9cfb1SNaoya Horiguchi 	 * when page != pagecache_page or !pagecache_page.
289956c9cfb1SNaoya Horiguchi 	 * Note that locking order is always pagecache_page -> page,
290056c9cfb1SNaoya Horiguchi 	 * so no worry about deadlock.
290156c9cfb1SNaoya Horiguchi 	 */
29020fe6e20bSNaoya Horiguchi 	page = pte_page(entry);
290366aebce7SChris Metcalf 	get_page(page);
290456c9cfb1SNaoya Horiguchi 	if (page != pagecache_page)
29050fe6e20bSNaoya Horiguchi 		lock_page(page);
29060fe6e20bSNaoya Horiguchi 
29071e8f889bSDavid Gibson 	spin_lock(&mm->page_table_lock);
29081e8f889bSDavid Gibson 	/* Check for a racing update before calling hugetlb_cow */
2909b4d1d99fSDavid Gibson 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
2910b4d1d99fSDavid Gibson 		goto out_page_table_lock;
2911b4d1d99fSDavid Gibson 
2912b4d1d99fSDavid Gibson 
2913788c7df4SHugh Dickins 	if (flags & FAULT_FLAG_WRITE) {
2914106c992aSGerald Schaefer 		if (!huge_pte_write(entry)) {
291557303d80SAndy Whitcroft 			ret = hugetlb_cow(mm, vma, address, ptep, entry,
291657303d80SAndy Whitcroft 							pagecache_page);
2917b4d1d99fSDavid Gibson 			goto out_page_table_lock;
2918b4d1d99fSDavid Gibson 		}
2919106c992aSGerald Schaefer 		entry = huge_pte_mkdirty(entry);
2920b4d1d99fSDavid Gibson 	}
2921b4d1d99fSDavid Gibson 	entry = pte_mkyoung(entry);
2922788c7df4SHugh Dickins 	if (huge_ptep_set_access_flags(vma, address, ptep, entry,
2923788c7df4SHugh Dickins 						flags & FAULT_FLAG_WRITE))
29244b3073e1SRussell King 		update_mmu_cache(vma, address, ptep);
2925b4d1d99fSDavid Gibson 
2926b4d1d99fSDavid Gibson out_page_table_lock:
29271e8f889bSDavid Gibson 	spin_unlock(&mm->page_table_lock);
292857303d80SAndy Whitcroft 
292957303d80SAndy Whitcroft 	if (pagecache_page) {
293057303d80SAndy Whitcroft 		unlock_page(pagecache_page);
293157303d80SAndy Whitcroft 		put_page(pagecache_page);
293257303d80SAndy Whitcroft 	}
29331f64d69cSDean Nelson 	if (page != pagecache_page)
293456c9cfb1SNaoya Horiguchi 		unlock_page(page);
293566aebce7SChris Metcalf 	put_page(page);
293657303d80SAndy Whitcroft 
2937b4d1d99fSDavid Gibson out_mutex:
29383935baa9SDavid Gibson 	mutex_unlock(&hugetlb_instantiation_mutex);
29391e8f889bSDavid Gibson 
29401e8f889bSDavid Gibson 	return ret;
294186e5216fSAdam Litke }
294286e5216fSAdam Litke 
294328a35716SMichel Lespinasse long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
294463551ae0SDavid Gibson 			 struct page **pages, struct vm_area_struct **vmas,
294528a35716SMichel Lespinasse 			 unsigned long *position, unsigned long *nr_pages,
294628a35716SMichel Lespinasse 			 long i, unsigned int flags)
294763551ae0SDavid Gibson {
2948d5d4b0aaSChen, Kenneth W 	unsigned long pfn_offset;
2949d5d4b0aaSChen, Kenneth W 	unsigned long vaddr = *position;
295028a35716SMichel Lespinasse 	unsigned long remainder = *nr_pages;
2951a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
295263551ae0SDavid Gibson 
29531c59827dSHugh Dickins 	spin_lock(&mm->page_table_lock);
295463551ae0SDavid Gibson 	while (vaddr < vma->vm_end && remainder) {
295563551ae0SDavid Gibson 		pte_t *pte;
29562a15efc9SHugh Dickins 		int absent;
295763551ae0SDavid Gibson 		struct page *page;
295863551ae0SDavid Gibson 
29594c887265SAdam Litke 		/*
29604c887265SAdam Litke 		 * Some archs (sparc64, sh*) have multiple pte_ts to
29612a15efc9SHugh Dickins 		 * each hugepage.  We have to make sure we get the
29624c887265SAdam Litke 		 * first, for the page indexing below to work.
29634c887265SAdam Litke 		 */
2964a5516438SAndi Kleen 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
29652a15efc9SHugh Dickins 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
296663551ae0SDavid Gibson 
29672a15efc9SHugh Dickins 		/*
29682a15efc9SHugh Dickins 		 * When coredumping, it suits get_dump_page if we just return
29693ae77f43SHugh Dickins 		 * an error where there's an empty slot with no huge pagecache
29703ae77f43SHugh Dickins 		 * to back it.  This way, we avoid allocating a hugepage, and
29713ae77f43SHugh Dickins 		 * the sparse dumpfile avoids allocating disk blocks, but its
29723ae77f43SHugh Dickins 		 * huge holes still show up with zeroes where they need to be.
29732a15efc9SHugh Dickins 		 */
29743ae77f43SHugh Dickins 		if (absent && (flags & FOLL_DUMP) &&
29753ae77f43SHugh Dickins 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
29762a15efc9SHugh Dickins 			remainder = 0;
29772a15efc9SHugh Dickins 			break;
29782a15efc9SHugh Dickins 		}
29792a15efc9SHugh Dickins 
29809cc3a5bdSNaoya Horiguchi 		/*
29819cc3a5bdSNaoya Horiguchi 		 * We need call hugetlb_fault for both hugepages under migration
29829cc3a5bdSNaoya Horiguchi 		 * (in which case hugetlb_fault waits for the migration,) and
29839cc3a5bdSNaoya Horiguchi 		 * hwpoisoned hugepages (in which case we need to prevent the
29849cc3a5bdSNaoya Horiguchi 		 * caller from accessing to them.) In order to do this, we use
29859cc3a5bdSNaoya Horiguchi 		 * here is_swap_pte instead of is_hugetlb_entry_migration and
29869cc3a5bdSNaoya Horiguchi 		 * is_hugetlb_entry_hwpoisoned. This is because it simply covers
29879cc3a5bdSNaoya Horiguchi 		 * both cases, and because we can't follow correct pages
29889cc3a5bdSNaoya Horiguchi 		 * directly from any kind of swap entries.
29899cc3a5bdSNaoya Horiguchi 		 */
29909cc3a5bdSNaoya Horiguchi 		if (absent || is_swap_pte(huge_ptep_get(pte)) ||
2991106c992aSGerald Schaefer 		    ((flags & FOLL_WRITE) &&
2992106c992aSGerald Schaefer 		      !huge_pte_write(huge_ptep_get(pte)))) {
29934c887265SAdam Litke 			int ret;
29944c887265SAdam Litke 
29954c887265SAdam Litke 			spin_unlock(&mm->page_table_lock);
29962a15efc9SHugh Dickins 			ret = hugetlb_fault(mm, vma, vaddr,
29972a15efc9SHugh Dickins 				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
29984c887265SAdam Litke 			spin_lock(&mm->page_table_lock);
2999a89182c7SAdam Litke 			if (!(ret & VM_FAULT_ERROR))
30004c887265SAdam Litke 				continue;
30014c887265SAdam Litke 
30021c59827dSHugh Dickins 			remainder = 0;
30031c59827dSHugh Dickins 			break;
30041c59827dSHugh Dickins 		}
300563551ae0SDavid Gibson 
3006a5516438SAndi Kleen 		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
30077f2e9525SGerald Schaefer 		page = pte_page(huge_ptep_get(pte));
3008d5d4b0aaSChen, Kenneth W same_page:
3009d6692183SChen, Kenneth W 		if (pages) {
301069d177c2SAndy Whitcroft 			pages[i] = mem_map_offset(page, pfn_offset);
30114b2e38adSKOSAKI Motohiro 			get_page(pages[i]);
3012d6692183SChen, Kenneth W 		}
301363551ae0SDavid Gibson 
301463551ae0SDavid Gibson 		if (vmas)
301563551ae0SDavid Gibson 			vmas[i] = vma;
301663551ae0SDavid Gibson 
301763551ae0SDavid Gibson 		vaddr += PAGE_SIZE;
3018d5d4b0aaSChen, Kenneth W 		++pfn_offset;
301963551ae0SDavid Gibson 		--remainder;
302063551ae0SDavid Gibson 		++i;
3021d5d4b0aaSChen, Kenneth W 		if (vaddr < vma->vm_end && remainder &&
3022a5516438SAndi Kleen 				pfn_offset < pages_per_huge_page(h)) {
3023d5d4b0aaSChen, Kenneth W 			/*
3024d5d4b0aaSChen, Kenneth W 			 * We use pfn_offset to avoid touching the pageframes
3025d5d4b0aaSChen, Kenneth W 			 * of this compound page.
3026d5d4b0aaSChen, Kenneth W 			 */
3027d5d4b0aaSChen, Kenneth W 			goto same_page;
3028d5d4b0aaSChen, Kenneth W 		}
302963551ae0SDavid Gibson 	}
30301c59827dSHugh Dickins 	spin_unlock(&mm->page_table_lock);
303128a35716SMichel Lespinasse 	*nr_pages = remainder;
303263551ae0SDavid Gibson 	*position = vaddr;
303363551ae0SDavid Gibson 
30342a15efc9SHugh Dickins 	return i ? i : -EFAULT;
303563551ae0SDavid Gibson }
30368f860591SZhang, Yanmin 
30377da4d641SPeter Zijlstra unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
30388f860591SZhang, Yanmin 		unsigned long address, unsigned long end, pgprot_t newprot)
30398f860591SZhang, Yanmin {
30408f860591SZhang, Yanmin 	struct mm_struct *mm = vma->vm_mm;
30418f860591SZhang, Yanmin 	unsigned long start = address;
30428f860591SZhang, Yanmin 	pte_t *ptep;
30438f860591SZhang, Yanmin 	pte_t pte;
3044a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
30457da4d641SPeter Zijlstra 	unsigned long pages = 0;
30468f860591SZhang, Yanmin 
30478f860591SZhang, Yanmin 	BUG_ON(address >= end);
30488f860591SZhang, Yanmin 	flush_cache_range(vma, address, end);
30498f860591SZhang, Yanmin 
30503d48ae45SPeter Zijlstra 	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
30518f860591SZhang, Yanmin 	spin_lock(&mm->page_table_lock);
3052a5516438SAndi Kleen 	for (; address < end; address += huge_page_size(h)) {
30538f860591SZhang, Yanmin 		ptep = huge_pte_offset(mm, address);
30548f860591SZhang, Yanmin 		if (!ptep)
30558f860591SZhang, Yanmin 			continue;
30567da4d641SPeter Zijlstra 		if (huge_pmd_unshare(mm, &address, ptep)) {
30577da4d641SPeter Zijlstra 			pages++;
305839dde65cSChen, Kenneth W 			continue;
30597da4d641SPeter Zijlstra 		}
30607f2e9525SGerald Schaefer 		if (!huge_pte_none(huge_ptep_get(ptep))) {
30618f860591SZhang, Yanmin 			pte = huge_ptep_get_and_clear(mm, address, ptep);
3062106c992aSGerald Schaefer 			pte = pte_mkhuge(huge_pte_modify(pte, newprot));
3063be7517d6STony Lu 			pte = arch_make_huge_pte(pte, vma, NULL, 0);
30648f860591SZhang, Yanmin 			set_huge_pte_at(mm, address, ptep, pte);
30657da4d641SPeter Zijlstra 			pages++;
30668f860591SZhang, Yanmin 		}
30678f860591SZhang, Yanmin 	}
30688f860591SZhang, Yanmin 	spin_unlock(&mm->page_table_lock);
3069d833352aSMel Gorman 	/*
3070d833352aSMel Gorman 	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
3071d833352aSMel Gorman 	 * may have cleared our pud entry and done put_page on the page table:
3072d833352aSMel Gorman 	 * once we release i_mmap_mutex, another task can do the final put_page
3073d833352aSMel Gorman 	 * and that page table be reused and filled with junk.
3074d833352aSMel Gorman 	 */
30758f860591SZhang, Yanmin 	flush_tlb_range(vma, start, end);
3076d833352aSMel Gorman 	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
30777da4d641SPeter Zijlstra 
30787da4d641SPeter Zijlstra 	return pages << h->order;
30798f860591SZhang, Yanmin }
30808f860591SZhang, Yanmin 
3081a1e78772SMel Gorman int hugetlb_reserve_pages(struct inode *inode,
3082a1e78772SMel Gorman 					long from, long to,
30835a6fe125SMel Gorman 					struct vm_area_struct *vma,
3084ca16d140SKOSAKI Motohiro 					vm_flags_t vm_flags)
3085e4e574b7SAdam Litke {
308617c9d12eSMel Gorman 	long ret, chg;
3087a5516438SAndi Kleen 	struct hstate *h = hstate_inode(inode);
308890481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_inode(inode);
3089e4e574b7SAdam Litke 
3090a1e78772SMel Gorman 	/*
309117c9d12eSMel Gorman 	 * Only apply hugepage reservation if asked. At fault time, an
309217c9d12eSMel Gorman 	 * attempt will be made for VM_NORESERVE to allocate a page
309390481622SDavid Gibson 	 * without using reserves
309417c9d12eSMel Gorman 	 */
3095ca16d140SKOSAKI Motohiro 	if (vm_flags & VM_NORESERVE)
309617c9d12eSMel Gorman 		return 0;
309717c9d12eSMel Gorman 
309817c9d12eSMel Gorman 	/*
3099a1e78772SMel Gorman 	 * Shared mappings base their reservation on the number of pages that
3100a1e78772SMel Gorman 	 * are already allocated on behalf of the file. Private mappings need
3101a1e78772SMel Gorman 	 * to reserve the full area even if read-only as mprotect() may be
3102a1e78772SMel Gorman 	 * called to make the mapping read-write. Assume !vma is a shm mapping
3103a1e78772SMel Gorman 	 */
3104f83a275dSMel Gorman 	if (!vma || vma->vm_flags & VM_MAYSHARE)
3105e4e574b7SAdam Litke 		chg = region_chg(&inode->i_mapping->private_list, from, to);
31065a6fe125SMel Gorman 	else {
31075a6fe125SMel Gorman 		struct resv_map *resv_map = resv_map_alloc();
31085a6fe125SMel Gorman 		if (!resv_map)
31095a6fe125SMel Gorman 			return -ENOMEM;
31105a6fe125SMel Gorman 
311117c9d12eSMel Gorman 		chg = to - from;
311217c9d12eSMel Gorman 
31135a6fe125SMel Gorman 		set_vma_resv_map(vma, resv_map);
31145a6fe125SMel Gorman 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
31155a6fe125SMel Gorman 	}
31165a6fe125SMel Gorman 
3117c50ac050SDave Hansen 	if (chg < 0) {
3118c50ac050SDave Hansen 		ret = chg;
3119c50ac050SDave Hansen 		goto out_err;
3120c50ac050SDave Hansen 	}
312117c9d12eSMel Gorman 
312290481622SDavid Gibson 	/* There must be enough pages in the subpool for the mapping */
3123c50ac050SDave Hansen 	if (hugepage_subpool_get_pages(spool, chg)) {
3124c50ac050SDave Hansen 		ret = -ENOSPC;
3125c50ac050SDave Hansen 		goto out_err;
3126c50ac050SDave Hansen 	}
312717c9d12eSMel Gorman 
312817c9d12eSMel Gorman 	/*
312917c9d12eSMel Gorman 	 * Check enough hugepages are available for the reservation.
313090481622SDavid Gibson 	 * Hand the pages back to the subpool if there are not
313117c9d12eSMel Gorman 	 */
313217c9d12eSMel Gorman 	ret = hugetlb_acct_memory(h, chg);
313317c9d12eSMel Gorman 	if (ret < 0) {
313490481622SDavid Gibson 		hugepage_subpool_put_pages(spool, chg);
3135c50ac050SDave Hansen 		goto out_err;
313617c9d12eSMel Gorman 	}
313717c9d12eSMel Gorman 
313817c9d12eSMel Gorman 	/*
313917c9d12eSMel Gorman 	 * Account for the reservations made. Shared mappings record regions
314017c9d12eSMel Gorman 	 * that have reservations as they are shared by multiple VMAs.
314117c9d12eSMel Gorman 	 * When the last VMA disappears, the region map says how much
314217c9d12eSMel Gorman 	 * the reservation was and the page cache tells how much of
314317c9d12eSMel Gorman 	 * the reservation was consumed. Private mappings are per-VMA and
314417c9d12eSMel Gorman 	 * only the consumed reservations are tracked. When the VMA
314517c9d12eSMel Gorman 	 * disappears, the original reservation is the VMA size and the
314617c9d12eSMel Gorman 	 * consumed reservations are stored in the map. Hence, nothing
314717c9d12eSMel Gorman 	 * else has to be done for private mappings here
314817c9d12eSMel Gorman 	 */
3149f83a275dSMel Gorman 	if (!vma || vma->vm_flags & VM_MAYSHARE)
315017c9d12eSMel Gorman 		region_add(&inode->i_mapping->private_list, from, to);
3151a43a8c39SChen, Kenneth W 	return 0;
3152c50ac050SDave Hansen out_err:
31534523e145SDave Hansen 	if (vma)
3154c50ac050SDave Hansen 		resv_map_put(vma);
3155c50ac050SDave Hansen 	return ret;
3156a43a8c39SChen, Kenneth W }
3157a43a8c39SChen, Kenneth W 
3158a43a8c39SChen, Kenneth W void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3159a43a8c39SChen, Kenneth W {
3160a5516438SAndi Kleen 	struct hstate *h = hstate_inode(inode);
3161a43a8c39SChen, Kenneth W 	long chg = region_truncate(&inode->i_mapping->private_list, offset);
316290481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_inode(inode);
316345c682a6SKen Chen 
316445c682a6SKen Chen 	spin_lock(&inode->i_lock);
3165e4c6f8beSEric Sandeen 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
316645c682a6SKen Chen 	spin_unlock(&inode->i_lock);
316745c682a6SKen Chen 
316890481622SDavid Gibson 	hugepage_subpool_put_pages(spool, (chg - freed));
3169a5516438SAndi Kleen 	hugetlb_acct_memory(h, -(chg - freed));
3170a43a8c39SChen, Kenneth W }
317193f70f90SNaoya Horiguchi 
31723212b535SSteve Capper #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
31733212b535SSteve Capper static unsigned long page_table_shareable(struct vm_area_struct *svma,
31743212b535SSteve Capper 				struct vm_area_struct *vma,
31753212b535SSteve Capper 				unsigned long addr, pgoff_t idx)
31763212b535SSteve Capper {
31773212b535SSteve Capper 	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
31783212b535SSteve Capper 				svma->vm_start;
31793212b535SSteve Capper 	unsigned long sbase = saddr & PUD_MASK;
31803212b535SSteve Capper 	unsigned long s_end = sbase + PUD_SIZE;
31813212b535SSteve Capper 
31823212b535SSteve Capper 	/* Allow segments to share if only one is marked locked */
31833212b535SSteve Capper 	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
31843212b535SSteve Capper 	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
31853212b535SSteve Capper 
31863212b535SSteve Capper 	/*
31873212b535SSteve Capper 	 * match the virtual addresses, permission and the alignment of the
31883212b535SSteve Capper 	 * page table page.
31893212b535SSteve Capper 	 */
31903212b535SSteve Capper 	if (pmd_index(addr) != pmd_index(saddr) ||
31913212b535SSteve Capper 	    vm_flags != svm_flags ||
31923212b535SSteve Capper 	    sbase < svma->vm_start || svma->vm_end < s_end)
31933212b535SSteve Capper 		return 0;
31943212b535SSteve Capper 
31953212b535SSteve Capper 	return saddr;
31963212b535SSteve Capper }
31973212b535SSteve Capper 
31983212b535SSteve Capper static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
31993212b535SSteve Capper {
32003212b535SSteve Capper 	unsigned long base = addr & PUD_MASK;
32013212b535SSteve Capper 	unsigned long end = base + PUD_SIZE;
32023212b535SSteve Capper 
32033212b535SSteve Capper 	/*
32043212b535SSteve Capper 	 * check on proper vm_flags and page table alignment
32053212b535SSteve Capper 	 */
32063212b535SSteve Capper 	if (vma->vm_flags & VM_MAYSHARE &&
32073212b535SSteve Capper 	    vma->vm_start <= base && end <= vma->vm_end)
32083212b535SSteve Capper 		return 1;
32093212b535SSteve Capper 	return 0;
32103212b535SSteve Capper }
32113212b535SSteve Capper 
32123212b535SSteve Capper /*
32133212b535SSteve Capper  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
32143212b535SSteve Capper  * and returns the corresponding pte. While this is not necessary for the
32153212b535SSteve Capper  * !shared pmd case because we can allocate the pmd later as well, it makes the
32163212b535SSteve Capper  * code much cleaner. pmd allocation is essential for the shared case because
32173212b535SSteve Capper  * pud has to be populated inside the same i_mmap_mutex section - otherwise
32183212b535SSteve Capper  * racing tasks could either miss the sharing (see huge_pte_offset) or select a
32193212b535SSteve Capper  * bad pmd for sharing.
32203212b535SSteve Capper  */
32213212b535SSteve Capper pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
32223212b535SSteve Capper {
32233212b535SSteve Capper 	struct vm_area_struct *vma = find_vma(mm, addr);
32243212b535SSteve Capper 	struct address_space *mapping = vma->vm_file->f_mapping;
32253212b535SSteve Capper 	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
32263212b535SSteve Capper 			vma->vm_pgoff;
32273212b535SSteve Capper 	struct vm_area_struct *svma;
32283212b535SSteve Capper 	unsigned long saddr;
32293212b535SSteve Capper 	pte_t *spte = NULL;
32303212b535SSteve Capper 	pte_t *pte;
32313212b535SSteve Capper 
32323212b535SSteve Capper 	if (!vma_shareable(vma, addr))
32333212b535SSteve Capper 		return (pte_t *)pmd_alloc(mm, pud, addr);
32343212b535SSteve Capper 
32353212b535SSteve Capper 	mutex_lock(&mapping->i_mmap_mutex);
32363212b535SSteve Capper 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
32373212b535SSteve Capper 		if (svma == vma)
32383212b535SSteve Capper 			continue;
32393212b535SSteve Capper 
32403212b535SSteve Capper 		saddr = page_table_shareable(svma, vma, addr, idx);
32413212b535SSteve Capper 		if (saddr) {
32423212b535SSteve Capper 			spte = huge_pte_offset(svma->vm_mm, saddr);
32433212b535SSteve Capper 			if (spte) {
32443212b535SSteve Capper 				get_page(virt_to_page(spte));
32453212b535SSteve Capper 				break;
32463212b535SSteve Capper 			}
32473212b535SSteve Capper 		}
32483212b535SSteve Capper 	}
32493212b535SSteve Capper 
32503212b535SSteve Capper 	if (!spte)
32513212b535SSteve Capper 		goto out;
32523212b535SSteve Capper 
32533212b535SSteve Capper 	spin_lock(&mm->page_table_lock);
32543212b535SSteve Capper 	if (pud_none(*pud))
32553212b535SSteve Capper 		pud_populate(mm, pud,
32563212b535SSteve Capper 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
32573212b535SSteve Capper 	else
32583212b535SSteve Capper 		put_page(virt_to_page(spte));
32593212b535SSteve Capper 	spin_unlock(&mm->page_table_lock);
32603212b535SSteve Capper out:
32613212b535SSteve Capper 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
32623212b535SSteve Capper 	mutex_unlock(&mapping->i_mmap_mutex);
32633212b535SSteve Capper 	return pte;
32643212b535SSteve Capper }
32653212b535SSteve Capper 
32663212b535SSteve Capper /*
32673212b535SSteve Capper  * unmap huge page backed by shared pte.
32683212b535SSteve Capper  *
32693212b535SSteve Capper  * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
32703212b535SSteve Capper  * indicated by page_count > 1, unmap is achieved by clearing pud and
32713212b535SSteve Capper  * decrementing the ref count. If count == 1, the pte page is not shared.
32723212b535SSteve Capper  *
32733212b535SSteve Capper  * called with vma->vm_mm->page_table_lock held.
32743212b535SSteve Capper  *
32753212b535SSteve Capper  * returns: 1 successfully unmapped a shared pte page
32763212b535SSteve Capper  *	    0 the underlying pte page is not shared, or it is the last user
32773212b535SSteve Capper  */
32783212b535SSteve Capper int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
32793212b535SSteve Capper {
32803212b535SSteve Capper 	pgd_t *pgd = pgd_offset(mm, *addr);
32813212b535SSteve Capper 	pud_t *pud = pud_offset(pgd, *addr);
32823212b535SSteve Capper 
32833212b535SSteve Capper 	BUG_ON(page_count(virt_to_page(ptep)) == 0);
32843212b535SSteve Capper 	if (page_count(virt_to_page(ptep)) == 1)
32853212b535SSteve Capper 		return 0;
32863212b535SSteve Capper 
32873212b535SSteve Capper 	pud_clear(pud);
32883212b535SSteve Capper 	put_page(virt_to_page(ptep));
32893212b535SSteve Capper 	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
32903212b535SSteve Capper 	return 1;
32913212b535SSteve Capper }
32929e5fc74cSSteve Capper #define want_pmd_share()	(1)
32939e5fc74cSSteve Capper #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
32949e5fc74cSSteve Capper pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
32959e5fc74cSSteve Capper {
32969e5fc74cSSteve Capper 	return NULL;
32979e5fc74cSSteve Capper }
32989e5fc74cSSteve Capper #define want_pmd_share()	(0)
32993212b535SSteve Capper #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
33003212b535SSteve Capper 
33019e5fc74cSSteve Capper #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
33029e5fc74cSSteve Capper pte_t *huge_pte_alloc(struct mm_struct *mm,
33039e5fc74cSSteve Capper 			unsigned long addr, unsigned long sz)
33049e5fc74cSSteve Capper {
33059e5fc74cSSteve Capper 	pgd_t *pgd;
33069e5fc74cSSteve Capper 	pud_t *pud;
33079e5fc74cSSteve Capper 	pte_t *pte = NULL;
33089e5fc74cSSteve Capper 
33099e5fc74cSSteve Capper 	pgd = pgd_offset(mm, addr);
33109e5fc74cSSteve Capper 	pud = pud_alloc(mm, pgd, addr);
33119e5fc74cSSteve Capper 	if (pud) {
33129e5fc74cSSteve Capper 		if (sz == PUD_SIZE) {
33139e5fc74cSSteve Capper 			pte = (pte_t *)pud;
33149e5fc74cSSteve Capper 		} else {
33159e5fc74cSSteve Capper 			BUG_ON(sz != PMD_SIZE);
33169e5fc74cSSteve Capper 			if (want_pmd_share() && pud_none(*pud))
33179e5fc74cSSteve Capper 				pte = huge_pmd_share(mm, addr, pud);
33189e5fc74cSSteve Capper 			else
33199e5fc74cSSteve Capper 				pte = (pte_t *)pmd_alloc(mm, pud, addr);
33209e5fc74cSSteve Capper 		}
33219e5fc74cSSteve Capper 	}
33229e5fc74cSSteve Capper 	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
33239e5fc74cSSteve Capper 
33249e5fc74cSSteve Capper 	return pte;
33259e5fc74cSSteve Capper }
33269e5fc74cSSteve Capper 
33279e5fc74cSSteve Capper pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
33289e5fc74cSSteve Capper {
33299e5fc74cSSteve Capper 	pgd_t *pgd;
33309e5fc74cSSteve Capper 	pud_t *pud;
33319e5fc74cSSteve Capper 	pmd_t *pmd = NULL;
33329e5fc74cSSteve Capper 
33339e5fc74cSSteve Capper 	pgd = pgd_offset(mm, addr);
33349e5fc74cSSteve Capper 	if (pgd_present(*pgd)) {
33359e5fc74cSSteve Capper 		pud = pud_offset(pgd, addr);
33369e5fc74cSSteve Capper 		if (pud_present(*pud)) {
33379e5fc74cSSteve Capper 			if (pud_huge(*pud))
33389e5fc74cSSteve Capper 				return (pte_t *)pud;
33399e5fc74cSSteve Capper 			pmd = pmd_offset(pud, addr);
33409e5fc74cSSteve Capper 		}
33419e5fc74cSSteve Capper 	}
33429e5fc74cSSteve Capper 	return (pte_t *) pmd;
33439e5fc74cSSteve Capper }
33449e5fc74cSSteve Capper 
33459e5fc74cSSteve Capper struct page *
33469e5fc74cSSteve Capper follow_huge_pmd(struct mm_struct *mm, unsigned long address,
33479e5fc74cSSteve Capper 		pmd_t *pmd, int write)
33489e5fc74cSSteve Capper {
33499e5fc74cSSteve Capper 	struct page *page;
33509e5fc74cSSteve Capper 
33519e5fc74cSSteve Capper 	page = pte_page(*(pte_t *)pmd);
33529e5fc74cSSteve Capper 	if (page)
33539e5fc74cSSteve Capper 		page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
33549e5fc74cSSteve Capper 	return page;
33559e5fc74cSSteve Capper }
33569e5fc74cSSteve Capper 
33579e5fc74cSSteve Capper struct page *
33589e5fc74cSSteve Capper follow_huge_pud(struct mm_struct *mm, unsigned long address,
33599e5fc74cSSteve Capper 		pud_t *pud, int write)
33609e5fc74cSSteve Capper {
33619e5fc74cSSteve Capper 	struct page *page;
33629e5fc74cSSteve Capper 
33639e5fc74cSSteve Capper 	page = pte_page(*(pte_t *)pud);
33649e5fc74cSSteve Capper 	if (page)
33659e5fc74cSSteve Capper 		page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
33669e5fc74cSSteve Capper 	return page;
33679e5fc74cSSteve Capper }
33689e5fc74cSSteve Capper 
33699e5fc74cSSteve Capper #else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
33709e5fc74cSSteve Capper 
33719e5fc74cSSteve Capper /* Can be overriden by architectures */
33729e5fc74cSSteve Capper __attribute__((weak)) struct page *
33739e5fc74cSSteve Capper follow_huge_pud(struct mm_struct *mm, unsigned long address,
33749e5fc74cSSteve Capper 	       pud_t *pud, int write)
33759e5fc74cSSteve Capper {
33769e5fc74cSSteve Capper 	BUG();
33779e5fc74cSSteve Capper 	return NULL;
33789e5fc74cSSteve Capper }
33799e5fc74cSSteve Capper 
33809e5fc74cSSteve Capper #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
33819e5fc74cSSteve Capper 
3382d5bd9106SAndi Kleen #ifdef CONFIG_MEMORY_FAILURE
3383d5bd9106SAndi Kleen 
33846de2b1aaSNaoya Horiguchi /* Should be called in hugetlb_lock */
33856de2b1aaSNaoya Horiguchi static int is_hugepage_on_freelist(struct page *hpage)
33866de2b1aaSNaoya Horiguchi {
33876de2b1aaSNaoya Horiguchi 	struct page *page;
33886de2b1aaSNaoya Horiguchi 	struct page *tmp;
33896de2b1aaSNaoya Horiguchi 	struct hstate *h = page_hstate(hpage);
33906de2b1aaSNaoya Horiguchi 	int nid = page_to_nid(hpage);
33916de2b1aaSNaoya Horiguchi 
33926de2b1aaSNaoya Horiguchi 	list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
33936de2b1aaSNaoya Horiguchi 		if (page == hpage)
33946de2b1aaSNaoya Horiguchi 			return 1;
33956de2b1aaSNaoya Horiguchi 	return 0;
33966de2b1aaSNaoya Horiguchi }
33976de2b1aaSNaoya Horiguchi 
339893f70f90SNaoya Horiguchi /*
339993f70f90SNaoya Horiguchi  * This function is called from memory failure code.
340093f70f90SNaoya Horiguchi  * Assume the caller holds page lock of the head page.
340193f70f90SNaoya Horiguchi  */
34026de2b1aaSNaoya Horiguchi int dequeue_hwpoisoned_huge_page(struct page *hpage)
340393f70f90SNaoya Horiguchi {
340493f70f90SNaoya Horiguchi 	struct hstate *h = page_hstate(hpage);
340593f70f90SNaoya Horiguchi 	int nid = page_to_nid(hpage);
34066de2b1aaSNaoya Horiguchi 	int ret = -EBUSY;
340793f70f90SNaoya Horiguchi 
340893f70f90SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
34096de2b1aaSNaoya Horiguchi 	if (is_hugepage_on_freelist(hpage)) {
341056f2fb14SNaoya Horiguchi 		/*
341156f2fb14SNaoya Horiguchi 		 * Hwpoisoned hugepage isn't linked to activelist or freelist,
341256f2fb14SNaoya Horiguchi 		 * but dangling hpage->lru can trigger list-debug warnings
341356f2fb14SNaoya Horiguchi 		 * (this happens when we call unpoison_memory() on it),
341456f2fb14SNaoya Horiguchi 		 * so let it point to itself with list_del_init().
341556f2fb14SNaoya Horiguchi 		 */
341656f2fb14SNaoya Horiguchi 		list_del_init(&hpage->lru);
34178c6c2ecbSNaoya Horiguchi 		set_page_refcounted(hpage);
341893f70f90SNaoya Horiguchi 		h->free_huge_pages--;
341993f70f90SNaoya Horiguchi 		h->free_huge_pages_node[nid]--;
34206de2b1aaSNaoya Horiguchi 		ret = 0;
342193f70f90SNaoya Horiguchi 	}
34226de2b1aaSNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
34236de2b1aaSNaoya Horiguchi 	return ret;
34246de2b1aaSNaoya Horiguchi }
34256de2b1aaSNaoya Horiguchi #endif
342631caf665SNaoya Horiguchi 
342731caf665SNaoya Horiguchi bool isolate_huge_page(struct page *page, struct list_head *list)
342831caf665SNaoya Horiguchi {
342931caf665SNaoya Horiguchi 	VM_BUG_ON(!PageHead(page));
343031caf665SNaoya Horiguchi 	if (!get_page_unless_zero(page))
343131caf665SNaoya Horiguchi 		return false;
343231caf665SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
343331caf665SNaoya Horiguchi 	list_move_tail(&page->lru, list);
343431caf665SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
343531caf665SNaoya Horiguchi 	return true;
343631caf665SNaoya Horiguchi }
343731caf665SNaoya Horiguchi 
343831caf665SNaoya Horiguchi void putback_active_hugepage(struct page *page)
343931caf665SNaoya Horiguchi {
344031caf665SNaoya Horiguchi 	VM_BUG_ON(!PageHead(page));
344131caf665SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
344231caf665SNaoya Horiguchi 	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
344331caf665SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
344431caf665SNaoya Horiguchi 	put_page(page);
344531caf665SNaoya Horiguchi }
3446