/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
/* Pool-wide counters: total huge pages owned, and how many sit free. */
static unsigned long nr_huge_pages, free_huge_pages;
/* Target pool size; set by "hugepages=" boot param or the sysctl below. */
unsigned long max_huge_pages;
/* Per-node free lists, plus per-node shadows of the global counters. */
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Add a free huge page to its node's free list and bump the free counters.
 * All callers in this file hold hugetlb_lock around this.
 */
static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

/*
 * Take one free huge page from the pool, honouring the VMA's mempolicy
 * zonelist and the current cpuset.  Returns NULL if no allowed node has a
 * free page.  vma may be NULL (pool-shrink path).  Callers hold hugetlb_lock.
 */
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid = numa_node_id();
	struct page *page = NULL;
	struct zonelist *zonelist = huge_zonelist(vma, address);
	struct zone **z;

	/* Scan zones in policy order for a node with a non-empty free list. */
	for (z = zonelist->zones; *z; z++) {
		nid = (*z)->zone_pgdat->node_id;
		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
		    !list_empty(&hugepage_freelists[nid]))
			break;
	}

	if (*z) {
		page = list_entry(hugepage_freelists[nid].next,
				struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}

/*
 * Allocate one fresh huge page from the buddy allocator, round-robining
 * across online nodes, and feed it into the pool.  Returns 1 on success,
 * 0 on allocation failure.
 *
 * NOTE(review): the static `nid` cursor is read/written outside any lock;
 * presumably racy updates were considered harmless here — confirm.
 */
static int alloc_fresh_huge_page(void)
{
	static int nid = 0;
	struct page *page;
	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	nid = (nid + 1) % num_online_nodes();
	if (page) {
		/* Install the compound-page destructor slot. */
		page[1].lru.next = (void *)free_huge_page;	/* dtor */
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}

/*
 * Compound-page destructor: runs when the last reference to a pool huge
 * page is dropped; returns the page to its node's free list.
 */
void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

/*
 * Hand out one huge page from the pool for mapping at @addr in @vma,
 * with refcount set and contents cleared.  Returns NULL if the pool
 * (as visible to this VMA's policy/cpuset) is empty.
 */
struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int i;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page(vma, addr);
	if (!page) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	}
	spin_unlock(&hugetlb_lock);
	set_page_refcounted(page);
	/* Zero every base page of the huge page before handing it out. */
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
		clear_user_highpage(&page[i], addr);
	return page;
}

/*
 * Boot-time initialisation: set up the per-node free lists and try to
 * allocate max_huge_pages into the pool; the counters are then clamped
 * to however many allocations actually succeeded.
 */
static int __init hugetlb_init(void)
{
	unsigned long i;

	/* HPAGE_SHIFT == 0: arch did not configure a huge page size — TODO confirm */
	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	/* NOTE(review): printk has no KERN_ level prefix. */
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

/* Parse the "hugepages=N" boot parameter into max_huge_pages. */
static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

#ifdef CONFIG_SYSCTL
/*
 * Give one huge page back to the buddy allocator: drop the pool counters,
 * scrub per-subpage flags, clear the destructor slot, restore a refcount
 * and free the compound page.  Caller holds hugetlb_lock.
 */
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1<< PG_writeback);
	}
	/* Remove the free_huge_page destructor installed at allocation. */
	page[1].lru.next = NULL;
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
/*
 * When shrinking the pool toward @count, free lowmem huge pages first —
 * lowmem is the scarcer resource.  Caller holds hugetlb_lock.
 */
static void try_to_free_low(unsigned long count)
{
	int i, nid;
	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			nid = page_zone(page)->zone_pgdat->node_id;
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
/* Without highmem there is no lowmem preference to apply. */
static inline void try_to_free_low(unsigned long count)
{
}
#endif

/*
 * Grow or shrink the pool to @count pages.  Growing stops at the first
 * failed allocation; shrinking can only release pages that are free.
 * Returns the resulting pool size.
 */
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		if (!alloc_fresh_huge_page())
			return nr_huge_pages;
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

/* sysctl handler: resize the pool to the value written to nr_hugepages. */
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}
#endif /* CONFIG_SYSCTL */

/* Format the global hugetlb counters (for /proc/meminfo — TODO confirm caller). */
int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free: %5lu\n"
			"Hugepagesize: %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			HPAGE_SIZE/1024);
}

/* Format the per-node hugetlb counters for node @nid. */
int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free: %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

/* Are there enough free huge pages to back @size bytes (rounded up)? */
int is_hugepage_mem_enough(size_t size)
{
	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
}

/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so we BUG if
 * we get this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};

/*
 * Build the huge PTE for @page in @vma: dirty+writable when @writable,
 * write-protected otherwise; always young and marked huge.
 */
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

/* Upgrade an existing huge PTE in place to dirty+writable. */
static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	ptep_set_access_flags(vma, address, ptep, entry, 1);
	update_mmu_cache(vma, address, entry);
	lazy_mmu_prot_update(entry);
}


/*
 * Fork-time copy of a hugetlb VMA's page tables from @src to @dst.
 * Pages are shared (refcount raised), not copied; for private writable
 * mappings (cow) the source PTE is write-protected so a later write in
 * either mm triggers hugetlb_cow().  dst's page_table_lock is taken
 * before src's.  Returns 0 or -ENOMEM.
 */
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

/*
 * Tear down the huge PTEs in [start, end): clear each PTE, drop the page
 * reference (which may free the page via free_huge_page), and lower the
 * rss accounting.  The TLB range is flushed after dropping the lock.
 */
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		put_page(page);
		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
	}

	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
}

/*
 * Copy-on-write fault on a huge page.  Called (and returns) with
 * mm->page_table_lock held, but drops it around the page copy; the PTE
 * is re-looked-up and re-checked afterwards in case of a racing update.
 * Returns VM_FAULT_MINOR or VM_FAULT_OOM.
 */
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int i, avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return VM_FAULT_MINOR;
	}

	/* Hold old_page across the unlocked copy below. */
	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
		copy_user_highpage(new_page + i, old_page + i,
				   address + i*PAGE_SIZE);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return VM_FAULT_MINOR;
}

/*
 * Fault on a not-present huge PTE: find or create the backing page in the
 * file's page cache (allocating and, for shared mappings, inserting it),
 * then install the PTE.  The page lock guards against racing truncate;
 * i_size is re-checked under page_table_lock, backing out on failure.
 * Returns VM_FAULT_MINOR, VM_FAULT_OOM, or VM_FAULT_SIGBUS.
 */
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	/* Huge-page index into the file for this faulting address. */
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				/* Someone else inserted the page first: use theirs. */
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*ptep))
		goto backout;

	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
	/* Only shared writable mappings get a writable PTE up front. */
	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

/*
 * Top-level hugetlb fault handler: dispatch missing PTEs to
 * hugetlb_no_page(), write faults on read-only PTEs to hugetlb_cow().
 * Returns a VM_FAULT_* code.
 */
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	entry = *ptep;
	if (pte_none(entry))
		return hugetlb_no_page(mm, vma, address, ptep, write_access);

	ret = VM_FAULT_MINOR;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);

	return ret;
}

/*
 * get_user_pages() support for hugetlb VMAs: walk [*position, vma->vm_end)
 * in PAGE_SIZE steps, faulting in missing huge pages (read faults only)
 * and filling @pages/@vmas.  Updates *position and *length to the
 * unprocessed remainder; returns pages gathered so far, or -EFAULT if a
 * fault could not be satisfied and nothing was gathered.
 */
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long vpfn, vaddr = *position;
	int remainder = *length;

	vpfn = vaddr/PAGE_SIZE;
	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			/* Drop the lock to fault the page in, then retry. */
			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (ret == VM_FAULT_MINOR)
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		if (pages) {
			/* Index the correct base page within the huge page. */
			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
			get_page(page);
			pages[i] = page;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++vpfn;
		--remainder;
		++i;
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}