11da177e4SLinus Torvalds /* 21da177e4SLinus Torvalds * Generic hugetlb support. 31da177e4SLinus Torvalds * (C) William Irwin, April 2004 41da177e4SLinus Torvalds */ 51da177e4SLinus Torvalds #include <linux/gfp.h> 61da177e4SLinus Torvalds #include <linux/list.h> 71da177e4SLinus Torvalds #include <linux/init.h> 81da177e4SLinus Torvalds #include <linux/module.h> 91da177e4SLinus Torvalds #include <linux/mm.h> 101da177e4SLinus Torvalds #include <linux/sysctl.h> 111da177e4SLinus Torvalds #include <linux/highmem.h> 121da177e4SLinus Torvalds #include <linux/nodemask.h> 1363551ae0SDavid Gibson #include <linux/pagemap.h> 145da7ca86SChristoph Lameter #include <linux/mempolicy.h> 15aea47ff3SChristoph Lameter #include <linux/cpuset.h> 163935baa9SDavid Gibson #include <linux/mutex.h> 175da7ca86SChristoph Lameter 1863551ae0SDavid Gibson #include <asm/page.h> 1963551ae0SDavid Gibson #include <asm/pgtable.h> 2063551ae0SDavid Gibson 2163551ae0SDavid Gibson #include <linux/hugetlb.h> 227835e98bSNick Piggin #include "internal.h" 231da177e4SLinus Torvalds 241da177e4SLinus Torvalds const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 251da177e4SLinus Torvalds static unsigned long nr_huge_pages, free_huge_pages; 261da177e4SLinus Torvalds unsigned long max_huge_pages; 271da177e4SLinus Torvalds static struct list_head hugepage_freelists[MAX_NUMNODES]; 281da177e4SLinus Torvalds static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 291da177e4SLinus Torvalds static unsigned int free_huge_pages_node[MAX_NUMNODES]; 303935baa9SDavid Gibson /* 313935baa9SDavid Gibson * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 323935baa9SDavid Gibson */ 333935baa9SDavid Gibson static DEFINE_SPINLOCK(hugetlb_lock); 340bd0f9fbSEric Paris 3579ac6ba4SDavid Gibson static void clear_huge_page(struct page *page, unsigned long addr) 3679ac6ba4SDavid Gibson { 3779ac6ba4SDavid Gibson int i; 3879ac6ba4SDavid Gibson 3979ac6ba4SDavid Gibson might_sleep(); 4079ac6ba4SDavid Gibson for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { 4179ac6ba4SDavid Gibson cond_resched(); 4279ac6ba4SDavid Gibson clear_user_highpage(page + i, addr); 4379ac6ba4SDavid Gibson } 4479ac6ba4SDavid Gibson } 4579ac6ba4SDavid Gibson 4679ac6ba4SDavid Gibson static void copy_huge_page(struct page *dst, struct page *src, 4779ac6ba4SDavid Gibson unsigned long addr) 4879ac6ba4SDavid Gibson { 4979ac6ba4SDavid Gibson int i; 5079ac6ba4SDavid Gibson 5179ac6ba4SDavid Gibson might_sleep(); 5279ac6ba4SDavid Gibson for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { 5379ac6ba4SDavid Gibson cond_resched(); 5479ac6ba4SDavid Gibson copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE); 5579ac6ba4SDavid Gibson } 5679ac6ba4SDavid Gibson } 5779ac6ba4SDavid Gibson 581da177e4SLinus Torvalds static void enqueue_huge_page(struct page *page) 591da177e4SLinus Torvalds { 601da177e4SLinus Torvalds int nid = page_to_nid(page); 611da177e4SLinus Torvalds list_add(&page->lru, &hugepage_freelists[nid]); 621da177e4SLinus Torvalds free_huge_pages++; 631da177e4SLinus Torvalds free_huge_pages_node[nid]++; 641da177e4SLinus Torvalds } 651da177e4SLinus Torvalds 665da7ca86SChristoph Lameter static struct page *dequeue_huge_page(struct vm_area_struct *vma, 675da7ca86SChristoph Lameter unsigned long address) 681da177e4SLinus Torvalds { 691da177e4SLinus Torvalds int nid = numa_node_id(); 701da177e4SLinus Torvalds struct page *page = NULL; 715da7ca86SChristoph Lameter struct zonelist *zonelist = huge_zonelist(vma, address); 7296df9333SChristoph Lameter struct zone **z; 731da177e4SLinus Torvalds 7496df9333SChristoph Lameter for (z = zonelist->zones; *z; z++) { 7596df9333SChristoph Lameter nid = (*z)->zone_pgdat->node_id; 76aea47ff3SChristoph Lameter if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && 77aea47ff3SChristoph Lameter !list_empty(&hugepage_freelists[nid])) 781da177e4SLinus Torvalds break; 791da177e4SLinus Torvalds } 8096df9333SChristoph Lameter 8196df9333SChristoph Lameter if (*z) { 821da177e4SLinus Torvalds page = list_entry(hugepage_freelists[nid].next, 831da177e4SLinus Torvalds struct page, lru); 841da177e4SLinus Torvalds list_del(&page->lru); 851da177e4SLinus Torvalds free_huge_pages--; 861da177e4SLinus Torvalds free_huge_pages_node[nid]--; 871da177e4SLinus Torvalds } 881da177e4SLinus Torvalds return page; 891da177e4SLinus Torvalds } 901da177e4SLinus Torvalds 91a482289dSNick Piggin static int alloc_fresh_huge_page(void) 921da177e4SLinus Torvalds { 931da177e4SLinus Torvalds static int nid = 0; 941da177e4SLinus Torvalds struct page *page; 951da177e4SLinus Torvalds page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, 961da177e4SLinus Torvalds HUGETLB_PAGE_ORDER); 971da177e4SLinus Torvalds nid = (nid + 1) % num_online_nodes(); 981da177e4SLinus Torvalds if (page) { 99a482289dSNick Piggin page[1].lru.next = (void *)free_huge_page; /* dtor */ 1000bd0f9fbSEric Paris spin_lock(&hugetlb_lock); 1011da177e4SLinus Torvalds nr_huge_pages++; 1021da177e4SLinus Torvalds nr_huge_pages_node[page_to_nid(page)]++; 1030bd0f9fbSEric Paris spin_unlock(&hugetlb_lock); 104a482289dSNick Piggin put_page(page); /* free it into the hugepage allocator */ 105a482289dSNick Piggin return 1; 1061da177e4SLinus Torvalds } 107a482289dSNick Piggin return 0; 1081da177e4SLinus Torvalds } 1091da177e4SLinus Torvalds 1101da177e4SLinus Torvalds void free_huge_page(struct page *page) 1111da177e4SLinus Torvalds { 1121da177e4SLinus Torvalds BUG_ON(page_count(page)); 1131da177e4SLinus Torvalds 1141da177e4SLinus Torvalds INIT_LIST_HEAD(&page->lru); 1151da177e4SLinus Torvalds 1161da177e4SLinus Torvalds spin_lock(&hugetlb_lock); 1171da177e4SLinus Torvalds enqueue_huge_page(page); 1181da177e4SLinus Torvalds spin_unlock(&hugetlb_lock); 1191da177e4SLinus Torvalds } 1201da177e4SLinus Torvalds 1215da7ca86SChristoph Lameter struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) 1221da177e4SLinus Torvalds { 1231da177e4SLinus Torvalds struct page *page; 1241da177e4SLinus Torvalds 1251da177e4SLinus Torvalds spin_lock(&hugetlb_lock); 1265da7ca86SChristoph Lameter page = dequeue_huge_page(vma, addr); 1271da177e4SLinus Torvalds if (!page) { 1281da177e4SLinus Torvalds spin_unlock(&hugetlb_lock); 1291da177e4SLinus Torvalds return NULL; 1301da177e4SLinus Torvalds } 1311da177e4SLinus Torvalds spin_unlock(&hugetlb_lock); 1327835e98bSNick Piggin set_page_refcounted(page); 1331da177e4SLinus Torvalds return page; 1341da177e4SLinus Torvalds } 1351da177e4SLinus Torvalds 1361da177e4SLinus Torvalds static int __init hugetlb_init(void) 1371da177e4SLinus Torvalds { 1381da177e4SLinus Torvalds unsigned long i; 1391da177e4SLinus Torvalds 1403c726f8dSBenjamin Herrenschmidt if (HPAGE_SHIFT == 0) 1413c726f8dSBenjamin Herrenschmidt return 0; 1423c726f8dSBenjamin Herrenschmidt 1431da177e4SLinus Torvalds for (i = 0; i < MAX_NUMNODES; ++i) 1441da177e4SLinus Torvalds INIT_LIST_HEAD(&hugepage_freelists[i]); 1451da177e4SLinus Torvalds 1461da177e4SLinus Torvalds for (i = 0; i < max_huge_pages; ++i) { 147a482289dSNick Piggin if (!alloc_fresh_huge_page()) 1481da177e4SLinus Torvalds break; 1491da177e4SLinus Torvalds } 1501da177e4SLinus Torvalds max_huge_pages = free_huge_pages = nr_huge_pages = i; 1511da177e4SLinus Torvalds printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); 1521da177e4SLinus Torvalds return 0; 1531da177e4SLinus Torvalds } 1541da177e4SLinus Torvalds module_init(hugetlb_init); 1551da177e4SLinus Torvalds 1561da177e4SLinus Torvalds static int __init hugetlb_setup(char *s) 1571da177e4SLinus Torvalds { 1581da177e4SLinus Torvalds if (sscanf(s, "%lu", &max_huge_pages) <= 0) 1591da177e4SLinus Torvalds max_huge_pages = 0; 1601da177e4SLinus Torvalds return 1; 1611da177e4SLinus Torvalds } 1621da177e4SLinus Torvalds __setup("hugepages=", hugetlb_setup); 1631da177e4SLinus Torvalds 1641da177e4SLinus Torvalds #ifdef CONFIG_SYSCTL 1651da177e4SLinus Torvalds static void update_and_free_page(struct page *page) 1661da177e4SLinus Torvalds { 1671da177e4SLinus Torvalds int i; 1681da177e4SLinus Torvalds nr_huge_pages--; 1691da177e4SLinus Torvalds nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--; 1701da177e4SLinus Torvalds for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 1711da177e4SLinus Torvalds page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1721da177e4SLinus Torvalds 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1731da177e4SLinus Torvalds 1 << PG_private | 1<< PG_writeback); 1741da177e4SLinus Torvalds } 175a482289dSNick Piggin page[1].lru.next = NULL; 1767835e98bSNick Piggin set_page_refcounted(page); 1771da177e4SLinus Torvalds __free_pages(page, HUGETLB_PAGE_ORDER); 1781da177e4SLinus Torvalds } 1791da177e4SLinus Torvalds 1801da177e4SLinus Torvalds #ifdef CONFIG_HIGHMEM 1811da177e4SLinus Torvalds static void try_to_free_low(unsigned long count) 1821da177e4SLinus Torvalds { 1831da177e4SLinus Torvalds int i, nid; 1841da177e4SLinus Torvalds for (i = 0; i < MAX_NUMNODES; ++i) { 1851da177e4SLinus Torvalds struct page *page, *next; 1861da177e4SLinus Torvalds list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 1871da177e4SLinus Torvalds if (PageHighMem(page)) 1881da177e4SLinus Torvalds continue; 1891da177e4SLinus Torvalds list_del(&page->lru); 1901da177e4SLinus Torvalds update_and_free_page(page); 1911da177e4SLinus Torvalds nid = page_zone(page)->zone_pgdat->node_id; 1921da177e4SLinus Torvalds free_huge_pages--; 1931da177e4SLinus Torvalds free_huge_pages_node[nid]--; 1941da177e4SLinus Torvalds if (count >= nr_huge_pages) 1951da177e4SLinus Torvalds return; 1961da177e4SLinus Torvalds } 1971da177e4SLinus Torvalds } 1981da177e4SLinus Torvalds } 1991da177e4SLinus Torvalds #else 2001da177e4SLinus Torvalds static inline void try_to_free_low(unsigned long count) 2011da177e4SLinus Torvalds { 2021da177e4SLinus Torvalds } 2031da177e4SLinus Torvalds #endif 2041da177e4SLinus Torvalds 2051da177e4SLinus Torvalds static unsigned long set_max_huge_pages(unsigned long count) 2061da177e4SLinus Torvalds { 2071da177e4SLinus Torvalds while (count > nr_huge_pages) { 208a482289dSNick Piggin if (!alloc_fresh_huge_page()) 2091da177e4SLinus Torvalds return nr_huge_pages; 2101da177e4SLinus Torvalds } 2111da177e4SLinus Torvalds if (count >= nr_huge_pages) 2121da177e4SLinus Torvalds return nr_huge_pages; 2131da177e4SLinus Torvalds 2141da177e4SLinus Torvalds spin_lock(&hugetlb_lock); 2151da177e4SLinus Torvalds try_to_free_low(count); 2161da177e4SLinus Torvalds while (count < nr_huge_pages) { 2175da7ca86SChristoph Lameter struct page *page = dequeue_huge_page(NULL, 0); 2181da177e4SLinus Torvalds if (!page) 2191da177e4SLinus Torvalds break; 2201da177e4SLinus Torvalds update_and_free_page(page); 2211da177e4SLinus Torvalds } 2221da177e4SLinus Torvalds spin_unlock(&hugetlb_lock); 2231da177e4SLinus Torvalds return nr_huge_pages; 2241da177e4SLinus Torvalds } 2251da177e4SLinus Torvalds 2261da177e4SLinus Torvalds int hugetlb_sysctl_handler(struct ctl_table *table, int write, 2271da177e4SLinus Torvalds struct file *file, void __user *buffer, 2281da177e4SLinus Torvalds size_t *length, loff_t *ppos) 2291da177e4SLinus Torvalds { 2301da177e4SLinus Torvalds proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 2311da177e4SLinus Torvalds max_huge_pages = set_max_huge_pages(max_huge_pages); 2321da177e4SLinus Torvalds return 0; 2331da177e4SLinus Torvalds } 2341da177e4SLinus Torvalds #endif /* CONFIG_SYSCTL */ 2351da177e4SLinus Torvalds 2361da177e4SLinus Torvalds int hugetlb_report_meminfo(char *buf) 2371da177e4SLinus Torvalds { 2381da177e4SLinus Torvalds return sprintf(buf, 2391da177e4SLinus Torvalds "HugePages_Total: %5lu\n" 2401da177e4SLinus Torvalds "HugePages_Free: %5lu\n" 2411da177e4SLinus Torvalds "Hugepagesize: %5lu kB\n", 2421da177e4SLinus Torvalds nr_huge_pages, 2431da177e4SLinus Torvalds free_huge_pages, 2441da177e4SLinus Torvalds HPAGE_SIZE/1024); 2451da177e4SLinus Torvalds } 2461da177e4SLinus Torvalds 2471da177e4SLinus Torvalds int hugetlb_report_node_meminfo(int nid, char *buf) 2481da177e4SLinus Torvalds { 2491da177e4SLinus Torvalds return sprintf(buf, 2501da177e4SLinus Torvalds "Node %d HugePages_Total: %5u\n" 2511da177e4SLinus Torvalds "Node %d HugePages_Free: %5u\n", 2521da177e4SLinus Torvalds nid, nr_huge_pages_node[nid], 2531da177e4SLinus Torvalds nid, free_huge_pages_node[nid]); 2541da177e4SLinus Torvalds } 2551da177e4SLinus Torvalds 2561da177e4SLinus Torvalds int is_hugepage_mem_enough(size_t size) 2571da177e4SLinus Torvalds { 2581da177e4SLinus Torvalds return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages; 2591da177e4SLinus Torvalds } 2601da177e4SLinus Torvalds 2611da177e4SLinus Torvalds /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 2621da177e4SLinus Torvalds unsigned long hugetlb_total_pages(void) 2631da177e4SLinus Torvalds { 2641da177e4SLinus Torvalds return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); 2651da177e4SLinus Torvalds } 2661da177e4SLinus Torvalds 2671da177e4SLinus Torvalds /* 2681da177e4SLinus Torvalds * We cannot handle pagefaults against hugetlb pages at all. They cause 2691da177e4SLinus Torvalds * handle_mm_fault() to try to instantiate regular-sized pages in the 2701da177e4SLinus Torvalds * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get 2711da177e4SLinus Torvalds * this far. 2721da177e4SLinus Torvalds */ 2731da177e4SLinus Torvalds static struct page *hugetlb_nopage(struct vm_area_struct *vma, 2741da177e4SLinus Torvalds unsigned long address, int *unused) 2751da177e4SLinus Torvalds { 2761da177e4SLinus Torvalds BUG(); 2771da177e4SLinus Torvalds return NULL; 2781da177e4SLinus Torvalds } 2791da177e4SLinus Torvalds 2801da177e4SLinus Torvalds struct vm_operations_struct hugetlb_vm_ops = { 2811da177e4SLinus Torvalds .nopage = hugetlb_nopage, 2821da177e4SLinus Torvalds }; 2831da177e4SLinus Torvalds 2841e8f889bSDavid Gibson static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 2851e8f889bSDavid Gibson int writable) 28663551ae0SDavid Gibson { 28763551ae0SDavid Gibson pte_t entry; 28863551ae0SDavid Gibson 2891e8f889bSDavid Gibson if (writable) { 29063551ae0SDavid Gibson entry = 29163551ae0SDavid Gibson pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 29263551ae0SDavid Gibson } else { 29363551ae0SDavid Gibson entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); 29463551ae0SDavid Gibson } 29563551ae0SDavid Gibson entry = pte_mkyoung(entry); 29663551ae0SDavid Gibson entry = pte_mkhuge(entry); 29763551ae0SDavid Gibson 29863551ae0SDavid Gibson return entry; 29963551ae0SDavid Gibson } 30063551ae0SDavid Gibson 3011e8f889bSDavid Gibson static void set_huge_ptep_writable(struct vm_area_struct *vma, 3021e8f889bSDavid Gibson unsigned long address, pte_t *ptep) 3031e8f889bSDavid Gibson { 3041e8f889bSDavid Gibson pte_t entry; 3051e8f889bSDavid Gibson 3061e8f889bSDavid Gibson entry = pte_mkwrite(pte_mkdirty(*ptep)); 3071e8f889bSDavid Gibson ptep_set_access_flags(vma, address, ptep, entry, 1); 3081e8f889bSDavid Gibson update_mmu_cache(vma, address, entry); 3091e8f889bSDavid Gibson lazy_mmu_prot_update(entry); 3101e8f889bSDavid Gibson } 3111e8f889bSDavid Gibson 3121e8f889bSDavid Gibson 31363551ae0SDavid Gibson int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 31463551ae0SDavid Gibson struct vm_area_struct *vma) 31563551ae0SDavid Gibson { 31663551ae0SDavid Gibson pte_t *src_pte, *dst_pte, entry; 31763551ae0SDavid Gibson struct page *ptepage; 3181c59827dSHugh Dickins unsigned long addr; 3191e8f889bSDavid Gibson int cow; 3201e8f889bSDavid Gibson 3211e8f889bSDavid Gibson cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 32263551ae0SDavid Gibson 3231c59827dSHugh Dickins for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 324c74df32cSHugh Dickins src_pte = huge_pte_offset(src, addr); 325c74df32cSHugh Dickins if (!src_pte) 326c74df32cSHugh Dickins continue; 32763551ae0SDavid Gibson dst_pte = huge_pte_alloc(dst, addr); 32863551ae0SDavid Gibson if (!dst_pte) 32963551ae0SDavid Gibson goto nomem; 330c74df32cSHugh Dickins spin_lock(&dst->page_table_lock); 3311c59827dSHugh Dickins spin_lock(&src->page_table_lock); 332c74df32cSHugh Dickins if (!pte_none(*src_pte)) { 3331e8f889bSDavid Gibson if (cow) 3341e8f889bSDavid Gibson ptep_set_wrprotect(src, addr, src_pte); 33563551ae0SDavid Gibson entry = *src_pte; 33663551ae0SDavid Gibson ptepage = pte_page(entry); 33763551ae0SDavid Gibson get_page(ptepage); 3384294621fSHugh Dickins add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE); 33963551ae0SDavid Gibson set_huge_pte_at(dst, addr, dst_pte, entry); 3401c59827dSHugh Dickins } 3411c59827dSHugh Dickins spin_unlock(&src->page_table_lock); 342c74df32cSHugh Dickins spin_unlock(&dst->page_table_lock); 34363551ae0SDavid Gibson } 34463551ae0SDavid Gibson return 0; 34563551ae0SDavid Gibson 34663551ae0SDavid Gibson nomem: 34763551ae0SDavid Gibson return -ENOMEM; 34863551ae0SDavid Gibson } 34963551ae0SDavid Gibson 35063551ae0SDavid Gibson void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 35163551ae0SDavid Gibson unsigned long end) 35263551ae0SDavid Gibson { 35363551ae0SDavid Gibson struct mm_struct *mm = vma->vm_mm; 35463551ae0SDavid Gibson unsigned long address; 355c7546f8fSDavid Gibson pte_t *ptep; 35663551ae0SDavid Gibson pte_t pte; 35763551ae0SDavid Gibson struct page *page; 35863551ae0SDavid Gibson 35963551ae0SDavid Gibson WARN_ON(!is_vm_hugetlb_page(vma)); 36063551ae0SDavid Gibson BUG_ON(start & ~HPAGE_MASK); 36163551ae0SDavid Gibson BUG_ON(end & ~HPAGE_MASK); 36263551ae0SDavid Gibson 363508034a3SHugh Dickins spin_lock(&mm->page_table_lock); 364508034a3SHugh Dickins 365365e9c87SHugh Dickins /* Update high watermark before we lower rss */ 366365e9c87SHugh Dickins update_hiwater_rss(mm); 367365e9c87SHugh Dickins 36863551ae0SDavid Gibson for (address = start; address < end; address += HPAGE_SIZE) { 369c7546f8fSDavid Gibson ptep = huge_pte_offset(mm, address); 370c7546f8fSDavid Gibson if (!ptep) 371c7546f8fSDavid Gibson continue; 372c7546f8fSDavid Gibson 373c7546f8fSDavid Gibson pte = huge_ptep_get_and_clear(mm, address, ptep); 37463551ae0SDavid Gibson if (pte_none(pte)) 37563551ae0SDavid Gibson continue; 376c7546f8fSDavid Gibson 37763551ae0SDavid Gibson page = pte_page(pte); 37863551ae0SDavid Gibson put_page(page); 3794294621fSHugh Dickins add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE)); 38063551ae0SDavid Gibson } 38163551ae0SDavid Gibson 3821da177e4SLinus Torvalds spin_unlock(&mm->page_table_lock); 383508034a3SHugh Dickins flush_tlb_range(vma, start, end); 3841da177e4SLinus Torvalds } 38563551ae0SDavid Gibson 3861e8f889bSDavid Gibson static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 3871e8f889bSDavid Gibson unsigned long address, pte_t *ptep, pte_t pte) 3881e8f889bSDavid Gibson { 3891e8f889bSDavid Gibson struct page *old_page, *new_page; 39079ac6ba4SDavid Gibson int avoidcopy; 3911e8f889bSDavid Gibson 3921e8f889bSDavid Gibson old_page = pte_page(pte); 3931e8f889bSDavid Gibson 3941e8f889bSDavid Gibson /* If no-one else is actually using this page, avoid the copy 3951e8f889bSDavid Gibson * and just make the page writable */ 3961e8f889bSDavid Gibson avoidcopy = (page_count(old_page) == 1); 3971e8f889bSDavid Gibson if (avoidcopy) { 3981e8f889bSDavid Gibson set_huge_ptep_writable(vma, address, ptep); 3991e8f889bSDavid Gibson return VM_FAULT_MINOR; 4001e8f889bSDavid Gibson } 4011e8f889bSDavid Gibson 4021e8f889bSDavid Gibson page_cache_get(old_page); 4035da7ca86SChristoph Lameter new_page = alloc_huge_page(vma, address); 4041e8f889bSDavid Gibson 4051e8f889bSDavid Gibson if (!new_page) { 4061e8f889bSDavid Gibson page_cache_release(old_page); 4070df420d8SChristoph Lameter return VM_FAULT_OOM; 4081e8f889bSDavid Gibson } 4091e8f889bSDavid Gibson 4101e8f889bSDavid Gibson spin_unlock(&mm->page_table_lock); 41179ac6ba4SDavid Gibson copy_huge_page(new_page, old_page, address); 4121e8f889bSDavid Gibson spin_lock(&mm->page_table_lock); 4131e8f889bSDavid Gibson 4141e8f889bSDavid Gibson ptep = huge_pte_offset(mm, address & HPAGE_MASK); 4151e8f889bSDavid Gibson if (likely(pte_same(*ptep, pte))) { 4161e8f889bSDavid Gibson /* Break COW */ 4171e8f889bSDavid Gibson set_huge_pte_at(mm, address, ptep, 4181e8f889bSDavid Gibson make_huge_pte(vma, new_page, 1)); 4191e8f889bSDavid Gibson /* Make the old page be freed below */ 4201e8f889bSDavid Gibson new_page = old_page; 4211e8f889bSDavid Gibson } 4221e8f889bSDavid Gibson page_cache_release(new_page); 4231e8f889bSDavid Gibson page_cache_release(old_page); 4241e8f889bSDavid Gibson return VM_FAULT_MINOR; 4251e8f889bSDavid Gibson } 4261e8f889bSDavid Gibson 42786e5216fSAdam Litke int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 4281e8f889bSDavid Gibson unsigned long address, pte_t *ptep, int write_access) 429ac9b9c66SHugh Dickins { 430ac9b9c66SHugh Dickins int ret = VM_FAULT_SIGBUS; 4314c887265SAdam Litke unsigned long idx; 4324c887265SAdam Litke unsigned long size; 4334c887265SAdam Litke struct page *page; 4344c887265SAdam Litke struct address_space *mapping; 4351e8f889bSDavid Gibson pte_t new_pte; 4364c887265SAdam Litke 4374c887265SAdam Litke mapping = vma->vm_file->f_mapping; 4384c887265SAdam Litke idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 4394c887265SAdam Litke + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); 4404c887265SAdam Litke 4414c887265SAdam Litke /* 4424c887265SAdam Litke * Use page lock to guard against racing truncation 4434c887265SAdam Litke * before we get page_table_lock. 4444c887265SAdam Litke */ 4456bda666aSChristoph Lameter retry: 4466bda666aSChristoph Lameter page = find_lock_page(mapping, idx); 4476bda666aSChristoph Lameter if (!page) { 4486bda666aSChristoph Lameter if (hugetlb_get_quota(mapping)) 4494c887265SAdam Litke goto out; 4506bda666aSChristoph Lameter page = alloc_huge_page(vma, address); 4516bda666aSChristoph Lameter if (!page) { 4526bda666aSChristoph Lameter hugetlb_put_quota(mapping); 4530df420d8SChristoph Lameter ret = VM_FAULT_OOM; 4546bda666aSChristoph Lameter goto out; 4556bda666aSChristoph Lameter } 45679ac6ba4SDavid Gibson clear_huge_page(page, address); 457ac9b9c66SHugh Dickins 4586bda666aSChristoph Lameter if (vma->vm_flags & VM_SHARED) { 4596bda666aSChristoph Lameter int err; 4606bda666aSChristoph Lameter 4616bda666aSChristoph Lameter err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 4626bda666aSChristoph Lameter if (err) { 4636bda666aSChristoph Lameter put_page(page); 4646bda666aSChristoph Lameter hugetlb_put_quota(mapping); 4656bda666aSChristoph Lameter if (err == -EEXIST) 4666bda666aSChristoph Lameter goto retry; 4676bda666aSChristoph Lameter goto out; 4686bda666aSChristoph Lameter } 4696bda666aSChristoph Lameter } else 4706bda666aSChristoph Lameter lock_page(page); 4716bda666aSChristoph Lameter } 4721e8f889bSDavid Gibson 473ac9b9c66SHugh Dickins spin_lock(&mm->page_table_lock); 4744c887265SAdam Litke size = i_size_read(mapping->host) >> HPAGE_SHIFT; 4754c887265SAdam Litke if (idx >= size) 4764c887265SAdam Litke goto backout; 4774c887265SAdam Litke 478ac9b9c66SHugh Dickins ret = VM_FAULT_MINOR; 47986e5216fSAdam Litke if (!pte_none(*ptep)) 4804c887265SAdam Litke goto backout; 4814c887265SAdam Litke 4824c887265SAdam Litke add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); 4831e8f889bSDavid Gibson new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 4841e8f889bSDavid Gibson && (vma->vm_flags & VM_SHARED))); 4851e8f889bSDavid Gibson set_huge_pte_at(mm, address, ptep, new_pte); 4861e8f889bSDavid Gibson 4871e8f889bSDavid Gibson if (write_access && !(vma->vm_flags & VM_SHARED)) { 4881e8f889bSDavid Gibson /* Optimization, do the COW without a second fault */ 4891e8f889bSDavid Gibson ret = hugetlb_cow(mm, vma, address, ptep, new_pte); 4901e8f889bSDavid Gibson } 4911e8f889bSDavid Gibson 492ac9b9c66SHugh Dickins spin_unlock(&mm->page_table_lock); 4934c887265SAdam Litke unlock_page(page); 4944c887265SAdam Litke out: 495ac9b9c66SHugh Dickins return ret; 4964c887265SAdam Litke 4974c887265SAdam Litke backout: 4984c887265SAdam Litke spin_unlock(&mm->page_table_lock); 4994c887265SAdam Litke hugetlb_put_quota(mapping); 5004c887265SAdam Litke unlock_page(page); 5014c887265SAdam Litke put_page(page); 5024c887265SAdam Litke goto out; 503ac9b9c66SHugh Dickins } 504ac9b9c66SHugh Dickins 50586e5216fSAdam Litke int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 50686e5216fSAdam Litke unsigned long address, int write_access) 50786e5216fSAdam Litke { 50886e5216fSAdam Litke pte_t *ptep; 50986e5216fSAdam Litke pte_t entry; 5101e8f889bSDavid Gibson int ret; 5113935baa9SDavid Gibson static DEFINE_MUTEX(hugetlb_instantiation_mutex); 51286e5216fSAdam Litke 51386e5216fSAdam Litke ptep = huge_pte_alloc(mm, address); 51486e5216fSAdam Litke if (!ptep) 51586e5216fSAdam Litke return VM_FAULT_OOM; 51686e5216fSAdam Litke 5173935baa9SDavid Gibson /* 5183935baa9SDavid Gibson * Serialize hugepage allocation and instantiation, so that we don't 5193935baa9SDavid Gibson * get spurious allocation failures if two CPUs race to instantiate 5203935baa9SDavid Gibson * the same page in the page cache. 5213935baa9SDavid Gibson */ 5223935baa9SDavid Gibson mutex_lock(&hugetlb_instantiation_mutex); 52386e5216fSAdam Litke entry = *ptep; 5243935baa9SDavid Gibson if (pte_none(entry)) { 5253935baa9SDavid Gibson ret = hugetlb_no_page(mm, vma, address, ptep, write_access); 5263935baa9SDavid Gibson mutex_unlock(&hugetlb_instantiation_mutex); 5273935baa9SDavid Gibson return ret; 5283935baa9SDavid Gibson } 52986e5216fSAdam Litke 5301e8f889bSDavid Gibson ret = VM_FAULT_MINOR; 5311e8f889bSDavid Gibson 5321e8f889bSDavid Gibson spin_lock(&mm->page_table_lock); 5331e8f889bSDavid Gibson /* Check for a racing update before calling hugetlb_cow */ 5341e8f889bSDavid Gibson if (likely(pte_same(entry, *ptep))) 5351e8f889bSDavid Gibson if (write_access && !pte_write(entry)) 5361e8f889bSDavid Gibson ret = hugetlb_cow(mm, vma, address, ptep, entry); 5371e8f889bSDavid Gibson spin_unlock(&mm->page_table_lock); 5383935baa9SDavid Gibson mutex_unlock(&hugetlb_instantiation_mutex); 5391e8f889bSDavid Gibson 5401e8f889bSDavid Gibson return ret; 54186e5216fSAdam Litke } 54286e5216fSAdam Litke 54363551ae0SDavid Gibson int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 54463551ae0SDavid Gibson struct page **pages, struct vm_area_struct **vmas, 54563551ae0SDavid Gibson unsigned long *position, int *length, int i) 54663551ae0SDavid Gibson { 54763551ae0SDavid Gibson unsigned long vpfn, vaddr = *position; 54863551ae0SDavid Gibson int remainder = *length; 54963551ae0SDavid Gibson 55063551ae0SDavid Gibson vpfn = vaddr/PAGE_SIZE; 5511c59827dSHugh Dickins spin_lock(&mm->page_table_lock); 55263551ae0SDavid Gibson while (vaddr < vma->vm_end && remainder) { 55363551ae0SDavid Gibson pte_t *pte; 55463551ae0SDavid Gibson struct page *page; 55563551ae0SDavid Gibson 5564c887265SAdam Litke /* 5574c887265SAdam Litke * Some archs (sparc64, sh*) have multiple pte_ts to 5584c887265SAdam Litke * each hugepage. We have to make * sure we get the 5594c887265SAdam Litke * first, for the page indexing below to work. 5604c887265SAdam Litke */ 56163551ae0SDavid Gibson pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); 56263551ae0SDavid Gibson 5631c59827dSHugh Dickins if (!pte || pte_none(*pte)) { 5644c887265SAdam Litke int ret; 5654c887265SAdam Litke 5664c887265SAdam Litke spin_unlock(&mm->page_table_lock); 5674c887265SAdam Litke ret = hugetlb_fault(mm, vma, vaddr, 0); 5684c887265SAdam Litke spin_lock(&mm->page_table_lock); 5694c887265SAdam Litke if (ret == VM_FAULT_MINOR) 5704c887265SAdam Litke continue; 5714c887265SAdam Litke 5721c59827dSHugh Dickins remainder = 0; 5731c59827dSHugh Dickins if (!i) 5741c59827dSHugh Dickins i = -EFAULT; 5751c59827dSHugh Dickins break; 5761c59827dSHugh Dickins } 57763551ae0SDavid Gibson 5784c887265SAdam Litke if (pages) { 57963551ae0SDavid Gibson page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; 58063551ae0SDavid Gibson get_page(page); 58163551ae0SDavid Gibson pages[i] = page; 58263551ae0SDavid Gibson } 58363551ae0SDavid Gibson 58463551ae0SDavid Gibson if (vmas) 58563551ae0SDavid Gibson vmas[i] = vma; 58663551ae0SDavid Gibson 58763551ae0SDavid Gibson vaddr += PAGE_SIZE; 58863551ae0SDavid Gibson ++vpfn; 58963551ae0SDavid Gibson --remainder; 59063551ae0SDavid Gibson ++i; 59163551ae0SDavid Gibson } 5921c59827dSHugh Dickins spin_unlock(&mm->page_table_lock); 59363551ae0SDavid Gibson *length = remainder; 59463551ae0SDavid Gibson *position = vaddr; 59563551ae0SDavid Gibson 59663551ae0SDavid Gibson return i; 59763551ae0SDavid Gibson } 5988f860591SZhang, Yanmin 5998f860591SZhang, Yanmin void hugetlb_change_protection(struct vm_area_struct *vma, 6008f860591SZhang, Yanmin unsigned long address, unsigned long end, pgprot_t newprot) 6018f860591SZhang, Yanmin { 6028f860591SZhang, Yanmin struct mm_struct *mm = vma->vm_mm; 6038f860591SZhang, Yanmin unsigned long start = address; 6048f860591SZhang, Yanmin pte_t *ptep; 6058f860591SZhang, Yanmin pte_t pte; 6068f860591SZhang, Yanmin 6078f860591SZhang, Yanmin BUG_ON(address >= end); 6088f860591SZhang, Yanmin flush_cache_range(vma, address, end); 6098f860591SZhang, Yanmin 6108f860591SZhang, Yanmin spin_lock(&mm->page_table_lock); 6118f860591SZhang, Yanmin for (; address < end; address += HPAGE_SIZE) { 6128f860591SZhang, Yanmin ptep = huge_pte_offset(mm, address); 6138f860591SZhang, Yanmin if (!ptep) 6148f860591SZhang, Yanmin continue; 6158f860591SZhang, Yanmin if (!pte_none(*ptep)) { 6168f860591SZhang, Yanmin pte = huge_ptep_get_and_clear(mm, address, ptep); 6178f860591SZhang, Yanmin pte = pte_mkhuge(pte_modify(pte, newprot)); 6188f860591SZhang, Yanmin set_huge_pte_at(mm, address, ptep, pte); 6198f860591SZhang, Yanmin lazy_mmu_prot_update(pte); 6208f860591SZhang, Yanmin } 6218f860591SZhang, Yanmin } 6228f860591SZhang, Yanmin spin_unlock(&mm->page_table_lock); 6238f860591SZhang, Yanmin 6248f860591SZhang, Yanmin flush_tlb_range(vma, start, end); 6258f860591SZhang, Yanmin } 6268f860591SZhang, Yanmin 627