/* xref: /openbmc/linux/mm/hugetlb.c (revision 480eccf9) */
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

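/*
 * Clear or copy a huge page one base page at a time.  Working through
 * HPAGE_SIZE worth of (possibly highmem) pages can take a while, so
 * cond_resched() between base pages keeps latency down; both helpers may
 * sleep (see might_sleep()).
 */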
static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

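/*
 * Return a free huge page to the per-node free list.  Caller must hold
 * hugetlb_lock, which protects the free lists and the pool counters.
 */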
static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

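/*
 * Take a free huge page off the free lists, preferring nodes in the
 * zonelist order chosen for the faulting VMA's mempolicy and skipping
 * zones the current cpuset does not allow.  Caller must hold hugetlb_lock;
 * returns NULL if no suitable page is available.
 */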
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = zone_to_nid(*z);
		if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			break;
		}
	}
	mpol_free(mpol);	/* unref if mpol !NULL */
	return page;
}

static void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

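/*
 * Allocate a brand new huge page from the buddy allocator, spreading
 * allocations across online nodes in round-robin fashion, and hand it to
 * the hugepage pool via its compound page destructor.  Returns 1 on
 * success, 0 on failure.
 */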
static int alloc_fresh_huge_page(void)
{
	static int prev_nid;
	struct page *page;
	int nid;

	/*
	 * Copy static prev_nid to local nid, work on that, then copy it
	 * back to prev_nid afterwards: otherwise there's a window in which
	 * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node.
	 * But we don't need to use a spin_lock here: it really doesn't
	 * matter if occasionally a racer chooses the same nid as we do.
	 */
	nid = next_node(prev_nid, node_online_map);
	if (nid == MAX_NUMNODES)
		nid = first_node(node_online_map);
	prev_nid = nid;

	page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}

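/*
 * Hand out a huge page for a fault in @vma.  Shared (VM_MAYSHARE) mappings
 * consume a page that was reserved at mmap time, so the reserve count is
 * decremented; private mappings may only dip into pages that are not
 * needed to back existing reservations.  Returns a refcounted page, or
 * NULL if nothing can be handed out.
 */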
static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages--;
	else if (free_huge_pages <= resv_huge_pages)
		goto fail;

	page = dequeue_huge_page(vma, addr);
	if (!page)
		goto fail;

	spin_unlock(&hugetlb_lock);
	set_page_refcounted(page);
	return page;

fail:
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages++;
	spin_unlock(&hugetlb_lock);
	return NULL;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk(KERN_INFO "Total HugeTLB memory allocated, %lu\n",
			free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

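/*
 * "hugepages=N" on the kernel command line asks hugetlb_init() above to
 * pre-allocate N huge pages at boot; for example, booting with
 * "hugepages=64" requests 64 pages of HPAGE_SIZE each.  If fewer pages can
 * be allocated, max_huge_pages is clamped to the number actually obtained.
 */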
static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

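/*
 * Grow or shrink the huge page pool to @count pages: allocate fresh pages
 * while below the target, then free surplus pages (lowmem-backed pages
 * first, via try_to_free_low()), never dropping below the reserved page
 * count.  This is driven by the nr_hugepages sysctl, typically written as
 * e.g. "echo 64 > /proc/sys/vm/nr_hugepages".  Returns the resulting pool
 * size.
 */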
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		if (!alloc_fresh_huge_page())
			return nr_huge_pages;
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	count = max(count, resv_huge_pages);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
};

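/*
 * Build the huge PTE for a page: writable mappings get a dirty, writable
 * entry, read-only mappings a write-protected one; the entry is marked
 * young and huge in both cases.
 */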
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
		lazy_mmu_prot_update(entry);
	}
}

int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

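/*
 * Tear down the huge PTEs in [start, end), gathering the pages on a local
 * list so they are only released after the TLB flush.  Callers must hold
 * the file's i_mmap_lock (see unmap_hugepage_range() below).
 */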
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by the per-file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for a valid hugetlb area. However, vm_file will be NULL in the
	 * error cleanup path of do_mmap_pgoff. When the hugetlbfs ->mmap
	 * method fails, do_mmap_pgoff() nullifies vma->vm_file before calling
	 * this function to clean up. Since no pte has actually been set up,
	 * it is safe to do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

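/*
 * Handle a copy-on-write fault on a huge page.  If nobody else holds a
 * reference we simply make the existing PTE writable; otherwise allocate a
 * fresh huge page, copy into it and, provided the PTE has not changed under
 * us, install the new page in place of the old one.  Called and returns
 * with mm->page_table_lock held (dropped around the copy).
 */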
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}

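/*
 * Fault in a huge page that has no PTE yet: look it up in the page cache,
 * allocating, zeroing and (for shared mappings) inserting a new page if
 * necessary, then install the PTE under mm->page_table_lock.  The page lock
 * guards against racing truncation, and -EEXIST from add_to_page_cache()
 * means another thread won the race, so we retry the lookup.
 */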
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}
		clear_huge_page(page, address);

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

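/*
 * Top-level fault handler for hugetlb VMAs.  A single global mutex
 * serializes first-touch instantiation so that racing CPUs do not both
 * allocate a page for the same file offset and spuriously run the pool
 * dry; write faults on present, read-only PTEs are handled by
 * hugetlb_cow().
 */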
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

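/*
 * get_user_pages() back end for hugetlb VMAs: walk the requested range,
 * faulting pages in as needed, and fill @pages/@vmas one PAGE_SIZE subpage
 * at a time so callers that expect base-page granularity keep working.
 * Returns the updated page count, or -EFAULT if the very first page could
 * not be faulted in.
 */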
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_ERROR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

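/*
 * mprotect() support for hugetlb VMAs: rewrite every present huge PTE in
 * [address, end) with the new protection, skipping shared PMDs, then flush
 * the TLB for the range.  The i_mmap_lock is taken to keep
 * huge_pmd_unshare() stable while we walk the page tables.
 */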
void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
			lazy_mmu_prot_update(pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

struct file_region {
	struct list_head link;
	long from;
	long to;
};

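/*
 * Reserved ranges of a hugetlbfs file are tracked as a sorted list of
 * [from, to) file_regions hanging off the mapping's private_list, in units
 * of huge pages.  region_chg() reports how many extra pages a new range
 * would reserve, region_add() merges it into the list, and
 * region_truncate() drops everything past a given offset.  For example, if
 * the list already holds [0, 4) and we reserve [2, 6), region_chg() returns
 * 2 and region_add() leaves a single region [0, 6).
 */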
static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher than us then extend our area
		 * to include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle: allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to   = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area; if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	if ((delta + resv_huge_pages) <= free_huge_pages) {
		resv_huge_pages += delta;
		ret = 0;
	}
	spin_unlock(&hugetlb_lock);
	return ret;
}

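/*
 * Reserve huge pages for the file range [from, to) at mmap() time so that
 * later faults on a shared mapping cannot fail with SIGBUS: work out how
 * many pages the range adds on top of existing reservations, charge them
 * against the free pool, and only then record the range.
 */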
int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;
	/*
	 * When cpusets are configured, they break the strict hugetlb page
	 * reservation because the accounting is done on a global variable.
	 * Such a reservation is of little use with cpusets: it is not checked
	 * against page availability for the current cpuset, so the
	 * application can still be OOM'ed by the kernel if the cpuset it runs
	 * in has no free huge pages. Enforcing strict accounting per cpuset
	 * is almost impossible (or too ugly) because cpusets are too fluid:
	 * tasks and memory nodes can be moved between cpusets at any time.
	 *
	 * Changing the semantics of shared hugetlb mappings under cpusets is
	 * undesirable, however, so to preserve some of them we fall back to
	 * checking against the current free page availability in the cpuset,
	 * as a best effort to minimize the impact of the semantics cpusets
	 * do change.
	 */
	if (chg > cpuset_mems_nr(free_huge_pages_node))
		return -ENOMEM;

	ret = hugetlb_acct_memory(chg);
	if (ret < 0)
		return ret;
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);
	hugetlb_acct_memory(freed - chg);
}