xref: /openbmc/linux/mm/hugetlb.c (revision c7546f8f)
11da177e4SLinus Torvalds /*
21da177e4SLinus Torvalds  * Generic hugetlb support.
31da177e4SLinus Torvalds  * (C) William Irwin, April 2004
41da177e4SLinus Torvalds  */
51da177e4SLinus Torvalds #include <linux/gfp.h>
61da177e4SLinus Torvalds #include <linux/list.h>
71da177e4SLinus Torvalds #include <linux/init.h>
81da177e4SLinus Torvalds #include <linux/module.h>
91da177e4SLinus Torvalds #include <linux/mm.h>
101da177e4SLinus Torvalds #include <linux/sysctl.h>
111da177e4SLinus Torvalds #include <linux/highmem.h>
121da177e4SLinus Torvalds #include <linux/nodemask.h>
1363551ae0SDavid Gibson #include <linux/pagemap.h>
1463551ae0SDavid Gibson #include <asm/page.h>
1563551ae0SDavid Gibson #include <asm/pgtable.h>
1663551ae0SDavid Gibson 
1763551ae0SDavid Gibson #include <linux/hugetlb.h>
181da177e4SLinus Torvalds 
/*
 * Convenience bounds exported for hugetlb tuning; presumably used as
 * sysctl min/max limits elsewhere (not referenced in this file) -- confirm.
 */
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
/* Pool accounting: total and free huge page counts, global and per node. */
static unsigned long nr_huge_pages, free_huge_pages;
/* Target pool size, set by "hugepages=" or via hugetlb_sysctl_handler(). */
unsigned long max_huge_pages;
/* Per-node lists of free huge pages, linked through page->lru. */
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
/* Protects the free lists and the counters above. */
static DEFINE_SPINLOCK(hugetlb_lock);
261da177e4SLinus Torvalds 
271da177e4SLinus Torvalds static void enqueue_huge_page(struct page *page)
281da177e4SLinus Torvalds {
291da177e4SLinus Torvalds 	int nid = page_to_nid(page);
301da177e4SLinus Torvalds 	list_add(&page->lru, &hugepage_freelists[nid]);
311da177e4SLinus Torvalds 	free_huge_pages++;
321da177e4SLinus Torvalds 	free_huge_pages_node[nid]++;
331da177e4SLinus Torvalds }
341da177e4SLinus Torvalds 
351da177e4SLinus Torvalds static struct page *dequeue_huge_page(void)
361da177e4SLinus Torvalds {
371da177e4SLinus Torvalds 	int nid = numa_node_id();
381da177e4SLinus Torvalds 	struct page *page = NULL;
391da177e4SLinus Torvalds 
401da177e4SLinus Torvalds 	if (list_empty(&hugepage_freelists[nid])) {
411da177e4SLinus Torvalds 		for (nid = 0; nid < MAX_NUMNODES; ++nid)
421da177e4SLinus Torvalds 			if (!list_empty(&hugepage_freelists[nid]))
431da177e4SLinus Torvalds 				break;
441da177e4SLinus Torvalds 	}
451da177e4SLinus Torvalds 	if (nid >= 0 && nid < MAX_NUMNODES &&
461da177e4SLinus Torvalds 	    !list_empty(&hugepage_freelists[nid])) {
471da177e4SLinus Torvalds 		page = list_entry(hugepage_freelists[nid].next,
481da177e4SLinus Torvalds 				  struct page, lru);
491da177e4SLinus Torvalds 		list_del(&page->lru);
501da177e4SLinus Torvalds 		free_huge_pages--;
511da177e4SLinus Torvalds 		free_huge_pages_node[nid]--;
521da177e4SLinus Torvalds 	}
531da177e4SLinus Torvalds 	return page;
541da177e4SLinus Torvalds }
551da177e4SLinus Torvalds 
561da177e4SLinus Torvalds static struct page *alloc_fresh_huge_page(void)
571da177e4SLinus Torvalds {
581da177e4SLinus Torvalds 	static int nid = 0;
591da177e4SLinus Torvalds 	struct page *page;
601da177e4SLinus Torvalds 	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
611da177e4SLinus Torvalds 					HUGETLB_PAGE_ORDER);
621da177e4SLinus Torvalds 	nid = (nid + 1) % num_online_nodes();
631da177e4SLinus Torvalds 	if (page) {
641da177e4SLinus Torvalds 		nr_huge_pages++;
651da177e4SLinus Torvalds 		nr_huge_pages_node[page_to_nid(page)]++;
661da177e4SLinus Torvalds 	}
671da177e4SLinus Torvalds 	return page;
681da177e4SLinus Torvalds }
691da177e4SLinus Torvalds 
701da177e4SLinus Torvalds void free_huge_page(struct page *page)
711da177e4SLinus Torvalds {
721da177e4SLinus Torvalds 	BUG_ON(page_count(page));
731da177e4SLinus Torvalds 
741da177e4SLinus Torvalds 	INIT_LIST_HEAD(&page->lru);
751da177e4SLinus Torvalds 	page[1].mapping = NULL;
761da177e4SLinus Torvalds 
771da177e4SLinus Torvalds 	spin_lock(&hugetlb_lock);
781da177e4SLinus Torvalds 	enqueue_huge_page(page);
791da177e4SLinus Torvalds 	spin_unlock(&hugetlb_lock);
801da177e4SLinus Torvalds }
811da177e4SLinus Torvalds 
821da177e4SLinus Torvalds struct page *alloc_huge_page(void)
831da177e4SLinus Torvalds {
841da177e4SLinus Torvalds 	struct page *page;
851da177e4SLinus Torvalds 	int i;
861da177e4SLinus Torvalds 
871da177e4SLinus Torvalds 	spin_lock(&hugetlb_lock);
881da177e4SLinus Torvalds 	page = dequeue_huge_page();
891da177e4SLinus Torvalds 	if (!page) {
901da177e4SLinus Torvalds 		spin_unlock(&hugetlb_lock);
911da177e4SLinus Torvalds 		return NULL;
921da177e4SLinus Torvalds 	}
931da177e4SLinus Torvalds 	spin_unlock(&hugetlb_lock);
941da177e4SLinus Torvalds 	set_page_count(page, 1);
951da177e4SLinus Torvalds 	page[1].mapping = (void *)free_huge_page;
961da177e4SLinus Torvalds 	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
971da177e4SLinus Torvalds 		clear_highpage(&page[i]);
981da177e4SLinus Torvalds 	return page;
991da177e4SLinus Torvalds }
1001da177e4SLinus Torvalds 
1011da177e4SLinus Torvalds static int __init hugetlb_init(void)
1021da177e4SLinus Torvalds {
1031da177e4SLinus Torvalds 	unsigned long i;
1041da177e4SLinus Torvalds 	struct page *page;
1051da177e4SLinus Torvalds 
1061da177e4SLinus Torvalds 	for (i = 0; i < MAX_NUMNODES; ++i)
1071da177e4SLinus Torvalds 		INIT_LIST_HEAD(&hugepage_freelists[i]);
1081da177e4SLinus Torvalds 
1091da177e4SLinus Torvalds 	for (i = 0; i < max_huge_pages; ++i) {
1101da177e4SLinus Torvalds 		page = alloc_fresh_huge_page();
1111da177e4SLinus Torvalds 		if (!page)
1121da177e4SLinus Torvalds 			break;
1131da177e4SLinus Torvalds 		spin_lock(&hugetlb_lock);
1141da177e4SLinus Torvalds 		enqueue_huge_page(page);
1151da177e4SLinus Torvalds 		spin_unlock(&hugetlb_lock);
1161da177e4SLinus Torvalds 	}
1171da177e4SLinus Torvalds 	max_huge_pages = free_huge_pages = nr_huge_pages = i;
1181da177e4SLinus Torvalds 	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
1191da177e4SLinus Torvalds 	return 0;
1201da177e4SLinus Torvalds }
1211da177e4SLinus Torvalds module_init(hugetlb_init);
1221da177e4SLinus Torvalds 
1231da177e4SLinus Torvalds static int __init hugetlb_setup(char *s)
1241da177e4SLinus Torvalds {
1251da177e4SLinus Torvalds 	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
1261da177e4SLinus Torvalds 		max_huge_pages = 0;
1271da177e4SLinus Torvalds 	return 1;
1281da177e4SLinus Torvalds }
1291da177e4SLinus Torvalds __setup("hugepages=", hugetlb_setup);
1301da177e4SLinus Torvalds 
1311da177e4SLinus Torvalds #ifdef CONFIG_SYSCTL
/*
 * Give a huge page back to the buddy allocator: drop it from the pool
 * totals, scrub the per-page flags and counts the hugepage lifetime may
 * have left set, then free the whole order.  Caller must hold
 * hugetlb_lock.
 */
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		/* Clear state flags so the buddy allocator sees clean pages. */
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1<< PG_writeback);
		set_page_count(&page[i], 0);
	}
	/* __free_pages() consumes the reference given to the head page here. */
	set_page_count(page, 1);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}
1461da177e4SLinus Torvalds 
1471da177e4SLinus Torvalds #ifdef CONFIG_HIGHMEM
1481da177e4SLinus Torvalds static void try_to_free_low(unsigned long count)
1491da177e4SLinus Torvalds {
1501da177e4SLinus Torvalds 	int i, nid;
1511da177e4SLinus Torvalds 	for (i = 0; i < MAX_NUMNODES; ++i) {
1521da177e4SLinus Torvalds 		struct page *page, *next;
1531da177e4SLinus Torvalds 		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
1541da177e4SLinus Torvalds 			if (PageHighMem(page))
1551da177e4SLinus Torvalds 				continue;
1561da177e4SLinus Torvalds 			list_del(&page->lru);
1571da177e4SLinus Torvalds 			update_and_free_page(page);
1581da177e4SLinus Torvalds 			nid = page_zone(page)->zone_pgdat->node_id;
1591da177e4SLinus Torvalds 			free_huge_pages--;
1601da177e4SLinus Torvalds 			free_huge_pages_node[nid]--;
1611da177e4SLinus Torvalds 			if (count >= nr_huge_pages)
1621da177e4SLinus Torvalds 				return;
1631da177e4SLinus Torvalds 		}
1641da177e4SLinus Torvalds 	}
1651da177e4SLinus Torvalds }
1661da177e4SLinus Torvalds #else
/* Without CONFIG_HIGHMEM every page is lowmem; nothing to prefer freeing. */
static inline void try_to_free_low(unsigned long count)
{
}
1701da177e4SLinus Torvalds #endif
1711da177e4SLinus Torvalds 
/*
 * Grow or shrink the huge page pool to 'count' pages.  Returns the
 * resulting pool size, which may be smaller than requested if memory
 * ran out while growing.
 */
static unsigned long set_max_huge_pages(unsigned long count)
{
	/* Grow: allocate fresh pages outside the lock, enqueue under it. */
	while (count > nr_huge_pages) {
		struct page *page = alloc_fresh_huge_page();
		if (!page)
			return nr_huge_pages;
		spin_lock(&hugetlb_lock);
		enqueue_huge_page(page);
		spin_unlock(&hugetlb_lock);
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	/* Shrink: prefer freeing lowmem pages, then drain from any node. */
	spin_lock(&hugetlb_lock);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page();
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}
1961da177e4SLinus Torvalds 
1971da177e4SLinus Torvalds int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1981da177e4SLinus Torvalds 			   struct file *file, void __user *buffer,
1991da177e4SLinus Torvalds 			   size_t *length, loff_t *ppos)
2001da177e4SLinus Torvalds {
2011da177e4SLinus Torvalds 	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
2021da177e4SLinus Torvalds 	max_huge_pages = set_max_huge_pages(max_huge_pages);
2031da177e4SLinus Torvalds 	return 0;
2041da177e4SLinus Torvalds }
2051da177e4SLinus Torvalds #endif /* CONFIG_SYSCTL */
2061da177e4SLinus Torvalds 
/*
 * Emit the global hugetlb lines for /proc/meminfo into 'buf'.
 * Returns the number of characters written.
 */
int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			HPAGE_SIZE/1024);
}
2171da177e4SLinus Torvalds 
/*
 * Emit the per-node hugetlb counters for node 'nid' into 'buf'.
 * Returns the number of characters written.
 */
int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}
2261da177e4SLinus Torvalds 
2271da177e4SLinus Torvalds int is_hugepage_mem_enough(size_t size)
2281da177e4SLinus Torvalds {
2291da177e4SLinus Torvalds 	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
2301da177e4SLinus Torvalds }
2311da177e4SLinus Torvalds 
2321da177e4SLinus Torvalds /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
2331da177e4SLinus Torvalds unsigned long hugetlb_total_pages(void)
2341da177e4SLinus Torvalds {
2351da177e4SLinus Torvalds 	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
2361da177e4SLinus Torvalds }
2371da177e4SLinus Torvalds EXPORT_SYMBOL(hugetlb_total_pages);
2381da177e4SLinus Torvalds 
/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
 * get this far.
 */
2451da177e4SLinus Torvalds static struct page *hugetlb_nopage(struct vm_area_struct *vma,
2461da177e4SLinus Torvalds 				unsigned long address, int *unused)
2471da177e4SLinus Torvalds {
2481da177e4SLinus Torvalds 	BUG();
2491da177e4SLinus Torvalds 	return NULL;
2501da177e4SLinus Torvalds }
2511da177e4SLinus Torvalds 
/* Generic hugetlb VMA operations; faulting is forbidden (hugetlb_nopage). */
struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};
2551da177e4SLinus Torvalds 
25663551ae0SDavid Gibson static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
25763551ae0SDavid Gibson {
25863551ae0SDavid Gibson 	pte_t entry;
25963551ae0SDavid Gibson 
26063551ae0SDavid Gibson 	if (vma->vm_flags & VM_WRITE) {
26163551ae0SDavid Gibson 		entry =
26263551ae0SDavid Gibson 		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
26363551ae0SDavid Gibson 	} else {
26463551ae0SDavid Gibson 		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
26563551ae0SDavid Gibson 	}
26663551ae0SDavid Gibson 	entry = pte_mkyoung(entry);
26763551ae0SDavid Gibson 	entry = pte_mkhuge(entry);
26863551ae0SDavid Gibson 
26963551ae0SDavid Gibson 	return entry;
27063551ae0SDavid Gibson }
27163551ae0SDavid Gibson 
27263551ae0SDavid Gibson int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
27363551ae0SDavid Gibson 			    struct vm_area_struct *vma)
27463551ae0SDavid Gibson {
27563551ae0SDavid Gibson 	pte_t *src_pte, *dst_pte, entry;
27663551ae0SDavid Gibson 	struct page *ptepage;
27763551ae0SDavid Gibson 	unsigned long addr = vma->vm_start;
27863551ae0SDavid Gibson 	unsigned long end = vma->vm_end;
27963551ae0SDavid Gibson 
28063551ae0SDavid Gibson 	while (addr < end) {
28163551ae0SDavid Gibson 		dst_pte = huge_pte_alloc(dst, addr);
28263551ae0SDavid Gibson 		if (!dst_pte)
28363551ae0SDavid Gibson 			goto nomem;
28463551ae0SDavid Gibson 		src_pte = huge_pte_offset(src, addr);
28563551ae0SDavid Gibson 		BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
28663551ae0SDavid Gibson 		entry = *src_pte;
28763551ae0SDavid Gibson 		ptepage = pte_page(entry);
28863551ae0SDavid Gibson 		get_page(ptepage);
28963551ae0SDavid Gibson 		add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
29063551ae0SDavid Gibson 		set_huge_pte_at(dst, addr, dst_pte, entry);
29163551ae0SDavid Gibson 		addr += HPAGE_SIZE;
29263551ae0SDavid Gibson 	}
29363551ae0SDavid Gibson 	return 0;
29463551ae0SDavid Gibson 
29563551ae0SDavid Gibson nomem:
29663551ae0SDavid Gibson 	return -ENOMEM;
29763551ae0SDavid Gibson }
29863551ae0SDavid Gibson 
/*
 * Drop all huge ptes in [start, end) of 'vma', releasing the page
 * reference each installed pte held, then flush the TLB for the range.
 * 'start' and 'end' must be huge-page aligned.  Callers (see
 * zap_hugepage_range()) hold mm->page_table_lock.
 */
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (! ptep)
			/* This can happen on truncate, or if an
			 * mmap() is aborted due to an error before
			 * the prefault */
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		put_page(page);
	}
	/* One rss unit per small page spanned by the unmapped range. */
	add_mm_counter(mm, rss,  -((end - start) >> PAGE_SHIFT));
	flush_tlb_range(vma, start, end);
}
33063551ae0SDavid Gibson 
3311da177e4SLinus Torvalds void zap_hugepage_range(struct vm_area_struct *vma,
3321da177e4SLinus Torvalds 			unsigned long start, unsigned long length)
3331da177e4SLinus Torvalds {
3341da177e4SLinus Torvalds 	struct mm_struct *mm = vma->vm_mm;
3351da177e4SLinus Torvalds 
3361da177e4SLinus Torvalds 	spin_lock(&mm->page_table_lock);
3371da177e4SLinus Torvalds 	unmap_hugepage_range(vma, start, start + length);
3381da177e4SLinus Torvalds 	spin_unlock(&mm->page_table_lock);
3391da177e4SLinus Torvalds }
34063551ae0SDavid Gibson 
/*
 * Populate every huge pte in 'vma' up front, looking pages up in (or
 * adding them to) 'mapping's page cache at the corresponding huge-page
 * index.  Returns 0 on success, or -ENOMEM if a page table, fs quota
 * charge, or huge page allocation fails; ptes installed before the
 * failure are left in place.
 */
int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr;
	int ret = 0;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(vma->vm_start & ~HPAGE_MASK);
	BUG_ON(vma->vm_end & ~HPAGE_MASK);

	hugetlb_prefault_arch_hook(mm);

	spin_lock(&mm->page_table_lock);
	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		unsigned long idx;
		pte_t *pte = huge_pte_alloc(mm, addr);
		struct page *page;

		if (!pte) {
			ret = -ENOMEM;
			goto out;
		}
		if (! pte_none(*pte))
			hugetlb_clean_stale_pgtable(pte);

		/* Page cache index of this huge page within the file. */
		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
		page = find_get_page(mapping, idx);
		if (!page) {
			/* charge the fs quota first */
			if (hugetlb_get_quota(mapping)) {
				ret = -ENOMEM;
				goto out;
			}
			page = alloc_huge_page();
			if (!page) {
				/* Undo the quota charge on failure. */
				hugetlb_put_quota(mapping);
				ret = -ENOMEM;
				goto out;
			}
			/* GFP_ATOMIC: mm->page_table_lock is held here. */
			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
			if (! ret) {
				unlock_page(page);
			} else {
				hugetlb_put_quota(mapping);
				free_huge_page(page);
				goto out;
			}
		}
		add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
		set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
	}
out:
	spin_unlock(&mm->page_table_lock);
	return ret;
}
39763551ae0SDavid Gibson 
/*
 * get_user_pages() backend for hugetlb VMAs: fill 'pages' and/or 'vmas'
 * starting at slot 'i' with up to '*length' small pages beginning at
 * '*position', advancing both through the VMA.  Returns the next free
 * slot index.  Relies on the VMA having been prefaulted.
 */
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long vpfn, vaddr = *position;
	int remainder = *length;

	BUG_ON(!is_vm_hugetlb_page(vma));

	vpfn = vaddr/PAGE_SIZE;
	while (vaddr < vma->vm_end && remainder) {

		if (pages) {
			pte_t *pte;
			struct page *page;

			/* Some archs (sparc64, sh*) have multiple
			 * pte_ts to each hugepage.  We have to make
			 * sure we get the first, for the page
			 * indexing below to work. */
			pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

			/* hugetlb should be locked, and hence, prefaulted */
			WARN_ON(!pte || pte_none(*pte));

			/* Select the sub-page of the huge page for vaddr. */
			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];

			WARN_ON(!PageCompound(page));

			get_page(page);
			pages[i] = page;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++vpfn;
		--remainder;
		++i;
	}

	/* Report back how much remains and where we stopped. */
	*length = remainder;
	*position = vaddr;

	return i;
}
445