/* xref: /openbmc/linux/mm/hugetlb.c (revision 8f860591) */
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

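/*
 * Dequeue a free huge page, honouring NUMA placement.  huge_zonelist()
 * returns the zonelist for the faulting VMA's mempolicy, so the walk
 * below visits nodes in policy-preferred order; cpuset_zone_allowed()
 * then filters out zones the task's cpuset forbids.  The first node
 * that passes the cpuset check and has a free huge page wins.
 * Called with hugetlb_lock held.
 */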
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid = numa_node_id();
	struct page *page = NULL;
	struct zonelist *zonelist = huge_zonelist(vma, address);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = (*z)->zone_pgdat->node_id;
		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
		    !list_empty(&hugepage_freelists[nid]))
			break;
	}

	if (*z) {
		page = list_entry(hugepage_freelists[nid].next,
				  struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}

static int alloc_fresh_huge_page(void)
{
	static int nid = 0;
	struct page *page;
	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	nid = (nid + 1) % num_online_nodes();
	if (page) {
		page[1].lru.next = (void *)free_huge_page;	/* dtor */
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}

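/*
 * free_huge_page() is wired up as the compound-page destructor:
 * alloc_fresh_huge_page() stashes its address in page[1].lru.next,
 * so the final put_page() on a huge page lands here instead of in
 * the buddy allocator.  The page goes back on the per-node free
 * list rather than being split up and returned to the system;
 * update_and_free_page() below is what actually gives pages back.
 */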
void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int i;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page(vma, addr);
	if (!page) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	}
	spin_unlock(&hugetlb_lock);
	set_page_refcounted(page);
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
		clear_user_highpage(&page[i], addr);
	return page;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk(KERN_INFO "Total HugeTLB memory allocated, %lu\n",
	       free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

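/*
 * "hugepages=N" on the kernel command line sets the initial pool size
 * parsed below.  For example, booting with:
 *
 *	hugepages=64
 *
 * asks hugetlb_init() to reserve 64 huge pages at boot, which is the
 * reliable way to get them before memory fragments.  A failed sscanf()
 * leaves the pool empty rather than rejecting the option.
 */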
static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
				1 << PG_referenced | 1 << PG_dirty |
				1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	page[1].lru.next = NULL;
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i, nid;
	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			nid = page_zone(page)->zone_pgdat->node_id;
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

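/*
 * Resize the static pool to "count" pages.  Growing just allocates
 * fresh huge pages until the target is hit (or the allocator fails).
 * Shrinking first tries to release lowmem pages via try_to_free_low(),
 * then frees whatever else sits on the free lists; pages that are in
 * use are never reclaimed, so the pool can end up larger than asked.
 */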
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		if (!alloc_fresh_huge_page())
			return nr_huge_pages;
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

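/*
 * Backs the vm.nr_hugepages sysctl, so the pool can be resized at
 * run time, e.g. from a root shell:
 *
 *	echo 20 > /proc/sys/vm/nr_hugepages
 *
 * proc_doulongvec_minmax() parses the write into max_huge_pages,
 * which is then clamped to what set_max_huge_pages() actually
 * managed to allocate or free.
 */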
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}
#endif /* CONFIG_SYSCTL */

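/*
 * Feeds the hugetlb block of /proc/meminfo.  As an illustration, with
 * 2 MB huge pages (e.g. x86 with PAE) and a pool of four pages, two of
 * them free, the output would look roughly like:
 *
 *	HugePages_Total:     4
 *	HugePages_Free:      2
 *	Hugepagesize:     2048 kB
 */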
int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

int is_hugepage_mem_enough(size_t size)
{
	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
 * get this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	ptep_set_access_flags(vma, address, ptep, entry, 1);
	update_mmu_cache(vma, address, entry);
	lazy_mmu_prot_update(entry);
}

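/*
 * Copy huge-page mappings from parent to child at fork().  Shared
 * mappings are simply duplicated; for private (MAP_PRIVATE) ones,
 * "cow" is set and the parent's PTE is write-protected alongside the
 * child's copy, so the first write by either side will fault into
 * hugetlb_cow() below and get its own page.
 */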
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

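/*
 * Tear down the huge PTEs in [start, end), dropping one page reference
 * per mapping; the final reference frees the page back to the pool via
 * free_huge_page().  The rss high watermark is sampled before the
 * counters drop, and the TLB flush is deferred until after
 * page_table_lock is released.
 */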
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		put_page(page);
		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
	}

	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
}

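/*
 * Break copy-on-write on a private huge page.  If the faulting task
 * holds the only reference, the existing page is just made writable.
 * Otherwise a fresh huge page is allocated and copied into, with
 * page_table_lock dropped across the (long) copy; the PTE is then
 * re-checked under the lock and only replaced if it has not changed
 * behind our back, so a racing fault cannot install two pages.
 */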
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int i, avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return VM_FAULT_MINOR;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
		copy_user_highpage(new_page + i, old_page + i,
				   address + i*PAGE_SIZE);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return VM_FAULT_MINOR;
}

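/*
 * Fill in a missing huge PTE, either from the hugetlbfs page cache or
 * with a freshly allocated, zeroed huge page.  find_lock_page() plus
 * the -EEXIST retry makes racing faulters on a shared mapping converge
 * on a single page; the page lock also guards against truncation until
 * page_table_lock is taken and i_size re-checked.
 *
 * The usual way to reach this path from userspace is to touch a
 * mapping of a hugetlbfs file.  A minimal illustrative sketch (the
 * mount point /mnt/huge and file name are assumptions, not fixed by
 * this code):
 *
 *	int fd = open("/mnt/huge/buf", O_CREAT | O_RDWR, 0600);
 *	char *p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED, fd, 0);
 *	p[0] = 1;
 *
 * No fault happens at mmap() time; the first store above is what
 * instantiates the huge page, one fault per HPAGE_SIZE of the mapping.
 */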
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*ptep))
		goto backout;

	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	entry = *ptep;
	if (pte_none(entry))
		return hugetlb_no_page(mm, vma, address, ptep, write_access);

	ret = VM_FAULT_MINOR;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);

	return ret;
}

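/*
 * The hugetlb side of get_user_pages(): walk the huge PTEs covering
 * [*position, *position + *length) and hand back the individual
 * PAGE_SIZE subpages, faulting missing entries in via hugetlb_fault()
 * (page_table_lock is dropped around that call and the PTE looked up
 * again afterwards).  Returns the cumulative page count in i and
 * updates *position/*length to the unprocessed remainder.
 */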
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long vpfn, vaddr = *position;
	int remainder = *length;

	vpfn = vaddr/PAGE_SIZE;
	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (ret == VM_FAULT_MINOR)
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		if (pages) {
			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
			get_page(page);
			pages[i] = page;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++vpfn;
		--remainder;
		++i;
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

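/*
 * mprotect() for hugetlb VMAs: rewrite every present huge PTE in
 * [address, end) with the new protection bits, keeping the huge-page
 * bit via pte_mkhuge().  Caches are flushed before the PTEs change,
 * and the TLB range is flushed once afterwards rather than per page.
 */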
void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
			lazy_mmu_prot_update(pte);
		}
	}
	spin_unlock(&mm->page_table_lock);

	flush_tlb_range(vma, start, end);
}