/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/io.h>

#include <linux/hugetlb.h>
#include <linux/node.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;

static int max_hstate;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;

#define for_each_hstate(h) \
	for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Region tracking -- allows tracking of reservations and instantiated pages
 * across the pages in a mapping.
 *
 * The region data structures are protected by a combination of the mmap_sem
 * and the hugetlb_instantiation_mutex.
 * To access or modify a region the caller
 * must either hold the mmap_sem for write, or the mmap_sem for read and
 * the hugetlb_instantiation_mutex:
 *
 *	down_write(&mm->mmap_sem);
 * or
 *	down_read(&mm->mmap_sem);
 *	mutex_lock(&hugetlb_instantiation_mutex);
 */
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/*
		 * If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it.
		 */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}
static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/*
	 * If we are below the current region then a new region is required.
	 * Subtle: allocate a new region at the position, but make it zero
	 * size such that we can guarantee to record the reservation.
	 */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/*
		 * We overlap with this area; if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation.
		 */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}
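/*
 * Worked example of the two-phase protocol above (illustrative, not part
 * of the original source): callers first ask region_chg() how many new
 * huge pages a range would reserve, charge for that many, and then
 * commit the range with region_add().  Starting from a single region
 * [0, 2):
 *
 *	chg = region_chg(head, 1, 4);	returns 2 (pages 2 and 3 are new)
 *	... charge quota/reserve counters by 'chg' ...
 *	region_add(head, 1, 4);		the list now holds just [0, 4)
 *
 * region_chg() may also pre-allocate a zero-size region so that the
 * later region_add() cannot fail for lack of memory.
 */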
static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

static long region_count(struct list_head *head, long f, long t)
{
	struct file_region *rg;
	long chg = 0;

	/* Locate each segment we overlap with, and count that overlap. */
	list_for_each_entry(rg, head, link) {
		int seg_from;
		int seg_to;

		if (rg->to <= f)
			continue;
		if (rg->from >= t)
			break;

		seg_from = max(rg->from, f);
		seg_to = min(rg->to, t);

		chg += seg_to - seg_from;
	}

	return chg;
}

/*
 * Convert the address within this vma to the page offset within
 * the mapping, in pagecache page units; huge pages here.
 */
static pgoff_t vma_hugecache_offset(struct hstate *h,
			struct vm_area_struct *vma, unsigned long address)
{
	return ((address - vma->vm_start) >> huge_page_shift(h)) +
			(vma->vm_pgoff >> huge_page_order(h));
}

pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
				     unsigned long address)
{
	return vma_hugecache_offset(hstate_vma(vma), vma, address);
}
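/*
 * Worked example for vma_hugecache_offset() (illustrative numbers, not
 * from the original source): with 2MB huge pages on a 4KB base page
 * system, huge_page_shift() is 21 and huge_page_order() is 9.  For a
 * VMA with vm_start == 0x40000000 and vm_pgoff == 512 (a 2MB file
 * offset), an address of 0x40400000 maps to
 *
 *	((0x40400000 - 0x40000000) >> 21) + (512 >> 9) = 2 + 1 = 3
 *
 * i.e. the fourth huge page of the file, counted in huge-page units.
 */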
/*
 * Return the size of the pages allocated when backing a VMA. In the majority
 * of cases this will be the same size as used by the page table entries.
 */
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
	struct hstate *hstate;

	if (!is_vm_hugetlb_page(vma))
		return PAGE_SIZE;

	hstate = hstate_vma(vma);

	return 1UL << (hstate->order + PAGE_SHIFT);
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);

/*
 * Return the page size being used by the MMU to back a VMA. In the majority
 * of cases, the page size used by the kernel matches the MMU size. On
 * architectures where it differs, an architecture-specific version of this
 * function is required.
 */
#ifndef vma_mmu_pagesize
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	return vma_kernel_pagesize(vma);
}
#endif

/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have its future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file; this region map represents the backing file
 * pages which have ever had a reservation assigned, and it persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it; this region map represents those offsets which have consumed
 * reservation, i.e. where pages have been instantiated.
 */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
	return (unsigned long)vma->vm_private_data;
}

static void set_vma_private_data(struct vm_area_struct *vma,
							unsigned long value)
{
	vma->vm_private_data = (void *)value;
}
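/*
 * Minimal sketch of how the flag bits and the reservation map pointer
 * share vm_private_data (illustrative; it relies only on the helpers
 * above and on kmalloc() alignment keeping bits 0-1 clear):
 *
 *	set_vma_private_data(vma, (unsigned long)map | HPAGE_RESV_OWNER);
 *	map   = (struct resv_map *)(get_vma_private_data(vma) &
 *							~HPAGE_RESV_MASK);
 *	owner = get_vma_private_data(vma) & HPAGE_RESV_OWNER;
 */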
struct resv_map {
	struct kref refs;
	struct list_head regions;
};

static struct resv_map *resv_map_alloc(void)
{
	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
	if (!resv_map)
		return NULL;

	kref_init(&resv_map->refs);
	INIT_LIST_HEAD(&resv_map->regions);

	return resv_map;
}

static void resv_map_release(struct kref *ref)
{
	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);

	/* Clear out any active regions before we release the map. */
	region_truncate(&resv_map->regions, 0);
	kfree(resv_map);
}

static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	if (!(vma->vm_flags & VM_MAYSHARE))
		return (struct resv_map *)(get_vma_private_data(vma) &
							~HPAGE_RESV_MASK);
	return NULL;
}

static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);

	set_vma_private_data(vma, (get_vma_private_data(vma) &
				HPAGE_RESV_MASK) | (unsigned long)map);
}

static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);

	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}

static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));

	return (get_vma_private_data(vma) & flag) != 0;
}
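/*
 * Sketch of the intended resv_map lifecycle (illustrative; the actual
 * callers live elsewhere in this file): the map is allocated when a
 * MAP_PRIVATE owner mmap()s, shared across VMAs via the kref, and
 * resv_map_release() runs on the final put.
 *
 *	struct resv_map *map = resv_map_alloc();
 *	set_vma_resv_map(vma, map);
 *	set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 *	...
 *	kref_get(&map->refs);				duplicate a VMA
 *	kref_put(&map->refs, resv_map_release);		drop a VMA
 */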
/* Decrement the reserved pages in the hugepage pool by one */
static void decrement_hugepage_resv_vma(struct hstate *h,
			struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_NORESERVE)
		return;

	if (vma->vm_flags & VM_MAYSHARE) {
		/* Shared mappings always use reserves */
		h->resv_huge_pages--;
	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		/*
		 * Only the process that called mmap() has reserves for
		 * private mappings.
		 */
		h->resv_huge_pages--;
	}
}

/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	if (!(vma->vm_flags & VM_MAYSHARE))
		vma->vm_private_data = (void *)0;
}

/* Returns true if the VMA has associated reserve pages */
static int vma_has_reserves(struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_MAYSHARE)
		return 1;
	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
		return 1;
	return 0;
}

static void clear_gigantic_page(struct page *page,
			unsigned long addr, unsigned long sz)
{
	int i;
	struct page *p = page;

	might_sleep();
	for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
		cond_resched();
		clear_user_highpage(p, addr + i * PAGE_SIZE);
	}
}

static void clear_huge_page(struct page *page,
			unsigned long addr, unsigned long sz)
{
	int i;

	if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
		clear_gigantic_page(page, addr, sz);
		return;
	}

	might_sleep();
	for (i = 0; i < sz/PAGE_SIZE; i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_user_gigantic_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;
	struct hstate *h = hstate_vma(vma);
	struct page *dst_base = dst;
	struct page *src_base = src;

	for (i = 0; i < pages_per_huge_page(h); ) {
		cond_resched();
		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);

		i++;
		dst = mem_map_next(dst, dst_base, i);
		src = mem_map_next(src, src_base, i);
	}
}

static void copy_user_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;
	struct hstate *h = hstate_vma(vma);
	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
		copy_user_gigantic_page(dst, src, addr, vma);
		return;
	}

	might_sleep();
	for (i = 0; i < pages_per_huge_page(h); i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void copy_gigantic_page(struct page *dst, struct page *src)
{
	int i;
	struct hstate *h = page_hstate(src);
	struct page *dst_base = dst;
	struct page *src_base = src;

	for (i = 0; i < pages_per_huge_page(h); ) {
		cond_resched();
		copy_highpage(dst, src);

		i++;
		dst = mem_map_next(dst, dst_base, i);
		src = mem_map_next(src, src_base, i);
	}
}

void copy_huge_page(struct page *dst, struct page *src)
{
	int i;
	struct hstate *h = page_hstate(src);

	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
		copy_gigantic_page(dst, src);
		return;
	}

	might_sleep();
	for (i = 0; i < pages_per_huge_page(h); i++) {
		cond_resched();
		copy_highpage(dst + i, src + i);
	}
}

static void enqueue_huge_page(struct hstate *h, struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &h->hugepage_freelists[nid]);
	h->free_huge_pages++;
	h->free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
{
	struct page *page;

	if (list_empty(&h->hugepage_freelists[nid]))
		return NULL;
	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
	list_del(&page->lru);
	set_page_refcounted(page);
	h->free_huge_pages--;
	h->free_huge_pages_node[nid]--;
	return page;
}
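/*
 * Note on the free-list helpers above (summary, not from the original
 * source): both must run under hugetlb_lock, which keeps the invariant
 *
 *	h->free_huge_pages == sum of h->free_huge_pages_node[nid]
 *
 * intact.  Pages on a free list have a zero refcount, so
 * dequeue_huge_page_node() re-initializes the count via
 * set_page_refcounted() before handing the page out.
 */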
static struct page *dequeue_huge_page_vma(struct hstate *h,
				struct vm_area_struct *vma,
				unsigned long address, int avoid_reserve)
{
	struct page *page = NULL;
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zonelist;
	struct zone *zone;
	struct zoneref *z;

	get_mems_allowed();
	zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol, &nodemask);
	/*
	 * A child process with MAP_PRIVATE mappings created by its parent
	 * has no page reserves. This check ensures that reservations are
	 * not "stolen". The child may still get SIGKILLed.
	 */
	if (!vma_has_reserves(vma) &&
			h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

	/* If reserves cannot be used, ensure enough pages are in the pool */
	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						MAX_NR_ZONES - 1, nodemask) {
		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
			page = dequeue_huge_page_node(h, zone_to_nid(zone));
			if (page) {
				if (!avoid_reserve)
					decrement_hugepage_resv_vma(h, vma);
				break;
			}
		}
	}
err:
	mpol_cond_put(mpol);
	put_mems_allowed();
	return page;
}

static void update_and_free_page(struct hstate *h, struct page *page)
{
	int i;

	VM_BUG_ON(h->order >= MAX_ORDER);

	h->nr_huge_pages--;
	h->nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < pages_per_huge_page(h); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
				1 << PG_referenced | 1 << PG_dirty |
				1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	arch_release_hugepage(page);
	__free_pages(page, huge_page_order(h));
}

struct hstate *size_to_hstate(unsigned long size)
{
	struct hstate *h;

	for_each_hstate(h) {
		if (huge_page_size(h) == size)
			return h;
	}
	return NULL;
}
static void free_huge_page(struct page *page)
{
	/*
	 * Can't pass hstate in here because it is called from the
	 * compound page destructor.
	 */
	struct hstate *h = page_hstate(page);
	int nid = page_to_nid(page);
	struct address_space *mapping;

	mapping = (struct address_space *) page_private(page);
	set_page_private(page, 0);
	page->mapping = NULL;
	BUG_ON(page_count(page));
	BUG_ON(page_mapcount(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
		update_and_free_page(h, page);
		h->surplus_huge_pages--;
		h->surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(h, page);
	}
	spin_unlock(&hugetlb_lock);
	if (mapping)
		hugetlb_put_quota(mapping, 1);
}

static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
	set_compound_page_dtor(page, free_huge_page);
	spin_lock(&hugetlb_lock);
	h->nr_huge_pages++;
	h->nr_huge_pages_node[nid]++;
	spin_unlock(&hugetlb_lock);
	put_page(page); /* free it into the hugepage allocator */
}

static void prep_compound_gigantic_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;
	struct page *p = page + 1;

	/* we rely on prep_new_huge_page to set the destructor */
	set_compound_order(page, order);
	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
		__SetPageTail(p);
		p->first_page = page;
	}
}

int PageHuge(struct page *page)
{
	compound_page_dtor *dtor;

	if (!PageCompound(page))
		return 0;

	page = compound_head(page);
	dtor = get_compound_page_dtor(page);

	return dtor == free_huge_page;
}
EXPORT_SYMBOL_GPL(PageHuge);
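/*
 * Usage sketch for PageHuge() (illustrative): hugetlb pages are
 * recognized purely by their compound destructor, so the test needs no
 * hstate and works on any subpage of the compound page:
 *
 *	if (PageHuge(page))
 *		h = page_hstate(compound_head(page));
 */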
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
	struct page *page;

	if (h->order >= MAX_ORDER)
		return NULL;

	page = alloc_pages_exact_node(nid,
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
						__GFP_REPEAT|__GFP_NOWARN,
		huge_page_order(h));
	if (page) {
		if (arch_prepare_hugepage(page)) {
			__free_pages(page, huge_page_order(h));
			return NULL;
		}
		prep_new_huge_page(h, page, nid);
	}

	return page;
}

/*
 * Common helper functions for hstate_next_node_to_{alloc|free}.
 * We may have allocated or freed a huge page based on a different
 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
 * be outside of *nodes_allowed.  Ensure that we use an allowed
 * node for alloc or free.
 */
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	nid = next_node(nid, *nodes_allowed);
	if (nid == MAX_NUMNODES)
		nid = first_node(*nodes_allowed);
	VM_BUG_ON(nid >= MAX_NUMNODES);

	return nid;
}

static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	if (!node_isset(nid, *nodes_allowed))
		nid = next_node_allowed(nid, nodes_allowed);
	return nid;
}
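/*
 * Worked example for the helpers above (illustrative): with
 * nodes_allowed = { 0, 2 }, next_node_allowed(0, ...) returns 2, and
 * next_node_allowed(2, ...) wraps past MAX_NUMNODES back to
 * first_node(), returning 0.  get_valid_node_allowed(1, ...) maps the
 * disallowed node 1 forward to 2.
 */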
/*
 * Returns the previously saved node ["this node"] from which to
 * allocate a persistent huge page for the pool, and advances the
 * next node from which to allocate, handling wrap at the end of the
 * node mask.
 */
static int hstate_next_node_to_alloc(struct hstate *h,
					nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);

	return nid;
}

static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
	next_nid = start_nid;

	do {
		page = alloc_fresh_huge_page_node(h, next_nid);
		if (page) {
			ret = 1;
			break;
		}
		next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
	} while (next_nid != start_nid);

	if (ret)
		count_vm_event(HTLB_BUDDY_PGALLOC);
	else
		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);

	return ret;
}

/*
 * Helper for free_pool_huge_page() - return the previously saved
 * node ["this node"] from which to free a huge page.  Advance the
 * next node id whether or not we find a free huge page to free so
 * that the next attempt to free addresses the next node.
 */
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);

	return nid;
}
/*
 * Free huge page from pool from next node to free.
 * Attempt to keep persistent huge pages more or less
 * balanced over allowed nodes.
 * Called with hugetlb_lock locked.
 */
static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
							 bool acct_surplus)
{
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = hstate_next_node_to_free(h, nodes_allowed);
	next_nid = start_nid;

	do {
		/*
		 * If we're returning unused surplus pages, only examine
		 * nodes with surplus pages.
		 */
		if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
		    !list_empty(&h->hugepage_freelists[next_nid])) {
			struct page *page =
				list_entry(h->hugepage_freelists[next_nid].next,
					  struct page, lru);
			list_del(&page->lru);
			h->free_huge_pages--;
			h->free_huge_pages_node[next_nid]--;
			if (acct_surplus) {
				h->surplus_huge_pages--;
				h->surplus_huge_pages_node[next_nid]--;
			}
			update_and_free_page(h, page);
			ret = 1;
			break;
		}
		next_nid = hstate_next_node_to_free(h, nodes_allowed);
	} while (next_nid != start_nid);

	return ret;
}

static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
{
	struct page *page;
	unsigned int r_nid;

	if (h->order >= MAX_ORDER)
		return NULL;

	/*
	 * Assume we will successfully allocate the surplus page to
	 * prevent racing processes from causing the surplus to exceed
	 * overcommit.
	 *
	 * This however introduces a different race, where a process B
	 * tries to grow the static hugepage pool while alloc_pages() is
	 * called by process A. B will only examine the per-node
	 * counters in determining if surplus huge pages can be
	 * converted to normal huge pages in adjust_pool_surplus(). A
	 * won't be able to increment the per-node counter, until the
	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
	 * no more huge pages can be converted from surplus to normal
	 * state (and doesn't try to convert again).
	 * Thus, we have a
	 * case where a surplus huge page exists, the pool is grown, and
	 * the surplus huge page still exists after, even though it
	 * should just have been converted to a normal huge page. This
	 * does not leak memory, though, as the hugepage will be freed
	 * once it is out of use. It also does not allow the counters to
	 * go out of whack in adjust_pool_surplus() as we don't modify
	 * the node values until we've gotten the hugepage and only the
	 * per-node value is checked there.
	 */
	spin_lock(&hugetlb_lock);
	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	} else {
		h->nr_huge_pages++;
		h->surplus_huge_pages++;
	}
	spin_unlock(&hugetlb_lock);

	if (nid == NUMA_NO_NODE)
		page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
				   __GFP_REPEAT|__GFP_NOWARN,
				   huge_page_order(h));
	else
		page = alloc_pages_exact_node(nid,
			htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
			__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));

	if (page && arch_prepare_hugepage(page)) {
		__free_pages(page, huge_page_order(h));
		return NULL;
	}

	spin_lock(&hugetlb_lock);
	if (page) {
		r_nid = page_to_nid(page);
		set_compound_page_dtor(page, free_huge_page);
		/*
		 * We incremented the global counters already
		 */
		h->nr_huge_pages_node[r_nid]++;
		h->surplus_huge_pages_node[r_nid]++;
		__count_vm_event(HTLB_BUDDY_PGALLOC);
	} else {
		h->nr_huge_pages--;
		h->surplus_huge_pages--;
		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
	}
	spin_unlock(&hugetlb_lock);

	return page;
}
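/*
 * Accounting sketch for alloc_buddy_huge_page() (illustrative numbers):
 * with nr_overcommit_huge_pages == 2 and surplus_huge_pages == 1, a
 * caller optimistically bumps nr_huge_pages and surplus_huge_pages to
 * claim its slot before allocating; a second concurrent caller then
 * sees surplus_huge_pages == 2 and returns NULL.  If alloc_pages()
 * itself fails, both global counters are dropped again under the lock.
 */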
/*
 * This allocation function is useful in the context where vma is irrelevant.
 * E.g. soft-offlining uses this function because it only cares about the
 * physical address of the error page.
 */
struct page *alloc_huge_page_node(struct hstate *h, int nid)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_node(h, nid);
	spin_unlock(&hugetlb_lock);

	if (!page)
		page = alloc_buddy_huge_page(h, nid);

	return page;
}

/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */
static int gather_surplus_pages(struct hstate *h, int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
	if (needed <= 0) {
		h->resv_huge_pages += delta;
		return 0;
	}

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
		if (!page)
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			goto free;

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (h->resv_huge_pages + delta) -
			(h->free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.  Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
	 */
	needed += allocated;
	h->resv_huge_pages += delta;
	ret = 0;

	spin_unlock(&hugetlb_lock);
	/* Free the needed pages to the hugetlb pool */
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		if ((--needed) < 0)
			break;
		list_del(&page->lru);
		/*
		 * This page is now managed by the hugetlb allocator and has
		 * no users -- drop the buddy allocator's reference.
		 */
		put_page_testzero(page);
		VM_BUG_ON(page_count(page));
		enqueue_huge_page(h, page);
	}

	/* Free unnecessary surplus pages to the buddy allocator */
free:
	if (!list_empty(&surplus_list)) {
		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
			list_del(&page->lru);
			put_page(page);
		}
	}
	spin_lock(&hugetlb_lock);

	return ret;
}
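/*
 * Worked example for gather_surplus_pages() (illustrative numbers): with
 * resv_huge_pages == 10, free_huge_pages == 8 and delta == 3,
 * needed = (10 + 3) - 8 = 5, so five surplus pages are allocated.  If
 * the counters are unchanged after hugetlb_lock is retaken, the recheck
 * yields (10 + 3) - (8 + 5) = 0, 'needed' becomes 0 + 5 = 5, and all
 * five pages are enqueued into the pool; had other threads freed huge
 * pages in the meantime, the leftover surplus pages would be returned
 * to the buddy allocator instead.
 */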
/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they
 * were never used.
 * Called with hugetlb_lock held.
 */
static void return_unused_surplus_pages(struct hstate *h,
					unsigned long unused_resv_pages)
{
	unsigned long nr_pages;

	/* Uncommit the reservation */
	h->resv_huge_pages -= unused_resv_pages;

	/* Cannot return gigantic pages currently */
	if (h->order >= MAX_ORDER)
		return;

	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);

	/*
	 * We want to release as many surplus pages as possible, spread
	 * evenly across all nodes with memory. Iterate across these nodes
	 * until we can no longer free unreserved surplus pages. This occurs
	 * when the nodes with surplus pages have no free pages.
	 * free_pool_huge_page() will balance the freed pages across the
	 * on-line nodes with memory and will handle the hstate accounting.
	 */
	while (nr_pages--) {
		if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
			break;
	}
}

/*
 * Determine if the huge page at addr within the vma has an associated
 * reservation.  Where it does not we will need to logically increase
 * the reservation and actually increase the quota before an allocation
 * can occur.  Where any new reservation would be required the
 * reservation change is prepared, but not committed.  Once the page
 * has been quota'd, allocated and instantiated, the change should be
 * committed via vma_commit_reservation.  No action is required on
 * failure.
 */
static long vma_needs_reservation(struct hstate *h,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;

	if (vma->vm_flags & VM_MAYSHARE) {
		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
		return region_chg(&inode->i_mapping->private_list,
							idx, idx + 1);

	} else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		return 1;

	} else {
		long err;
		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
		struct resv_map *reservations = vma_resv_map(vma);

		err = region_chg(&reservations->regions, idx, idx + 1);
		if (err < 0)
			return err;
		return 0;
	}
}

static void vma_commit_reservation(struct hstate *h,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;

	if (vma->vm_flags & VM_MAYSHARE) {
		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
		region_add(&inode->i_mapping->private_list, idx, idx + 1);

	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
		struct resv_map *reservations = vma_resv_map(vma);

		/* Mark this page used in the map. */
		region_add(&reservations->regions, idx, idx + 1);
	}
}
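/*
 * Sketch of the two-phase reservation protocol implemented above
 * (illustrative; alloc_huge_page() below is the real caller):
 *
 *	chg = vma_needs_reservation(h, vma, addr);
 *	if (chg < 0)
 *		return ERR_PTR(chg);
 *	if (chg && hugetlb_get_quota(mapping, chg))
 *		return ERR_PTR(-ENOSPC);
 *	... allocate and instantiate the page ...
 *	vma_commit_reservation(h, vma, addr);
 *
 * The prepare step only examines (or pre-allocates in) the region map,
 * so nothing needs to be undone if allocation fails.
 */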
static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr, int avoid_reserve)
{
	struct hstate *h = hstate_vma(vma);
	struct page *page;
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;
	long chg;

	/*
	 * Processes that did not create the mapping will have no reserves
	 * and will not have accounted against quota. Check that the quota
	 * can be taken before satisfying the allocation.  MAP_NORESERVE
	 * mappings may also need pages and quota allocated if no reserve
	 * mapping overlaps.
	 */
	chg = vma_needs_reservation(h, vma, addr);
	if (chg < 0)
		return ERR_PTR(chg);
	if (chg)
		if (hugetlb_get_quota(inode->i_mapping, chg))
			return ERR_PTR(-ENOSPC);

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
	spin_unlock(&hugetlb_lock);

	if (!page) {
		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
		if (!page) {
			hugetlb_put_quota(inode->i_mapping, chg);
			return ERR_PTR(-VM_FAULT_SIGBUS);
		}
	}

	set_page_private(page, (unsigned long) mapping);

	vma_commit_reservation(h, vma, addr);

	return page;
}

int __weak alloc_bootmem_huge_page(struct hstate *h)
{
	struct huge_bootmem_page *m;
	int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);

	while (nr_nodes) {
		void *addr;

		addr = __alloc_bootmem_node_nopanic(
				NODE_DATA(hstate_next_node_to_alloc(h,
						&node_states[N_HIGH_MEMORY])),
				huge_page_size(h), huge_page_size(h), 0);

		if (addr) {
			/*
			 * Use the beginning of the huge page to store the
			 * huge_bootmem_page struct (until gather_bootmem
			 * puts them into the mem_map).
1143aa888a74SAndi Kleen */ 1144aa888a74SAndi Kleen m = addr; 1145aa888a74SAndi Kleen goto found; 1146aa888a74SAndi Kleen } 1147aa888a74SAndi Kleen nr_nodes--; 1148aa888a74SAndi Kleen } 1149aa888a74SAndi Kleen return 0; 1150aa888a74SAndi Kleen 1151aa888a74SAndi Kleen found: 1152aa888a74SAndi Kleen BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); 1153aa888a74SAndi Kleen /* Put them into a private list first because mem_map is not up yet */ 1154aa888a74SAndi Kleen list_add(&m->list, &huge_boot_pages); 1155aa888a74SAndi Kleen m->hstate = h; 1156aa888a74SAndi Kleen return 1; 1157aa888a74SAndi Kleen } 1158aa888a74SAndi Kleen 115918229df5SAndy Whitcroft static void prep_compound_huge_page(struct page *page, int order) 116018229df5SAndy Whitcroft { 116118229df5SAndy Whitcroft if (unlikely(order > (MAX_ORDER - 1))) 116218229df5SAndy Whitcroft prep_compound_gigantic_page(page, order); 116318229df5SAndy Whitcroft else 116418229df5SAndy Whitcroft prep_compound_page(page, order); 116518229df5SAndy Whitcroft } 116618229df5SAndy Whitcroft 1167aa888a74SAndi Kleen /* Put bootmem huge pages into the standard lists after mem_map is up */ 1168aa888a74SAndi Kleen static void __init gather_bootmem_prealloc(void) 1169aa888a74SAndi Kleen { 1170aa888a74SAndi Kleen struct huge_bootmem_page *m; 1171aa888a74SAndi Kleen 1172aa888a74SAndi Kleen list_for_each_entry(m, &huge_boot_pages, list) { 1173aa888a74SAndi Kleen struct page *page = virt_to_page(m); 1174aa888a74SAndi Kleen struct hstate *h = m->hstate; 1175aa888a74SAndi Kleen __ClearPageReserved(page); 1176aa888a74SAndi Kleen WARN_ON(page_count(page) != 1); 117718229df5SAndy Whitcroft prep_compound_huge_page(page, h->order); 1178aa888a74SAndi Kleen prep_new_huge_page(h, page, page_to_nid(page)); 1179aa888a74SAndi Kleen } 1180aa888a74SAndi Kleen } 1181aa888a74SAndi Kleen 11828faa8b07SAndi Kleen static void __init hugetlb_hstate_alloc_pages(struct hstate *h) 11831da177e4SLinus Torvalds { 11841da177e4SLinus Torvalds unsigned long i; 11851da177e4SLinus Torvalds 1186e5ff2159SAndi Kleen for (i = 0; i < h->max_huge_pages; ++i) { 1187aa888a74SAndi Kleen if (h->order >= MAX_ORDER) { 1188aa888a74SAndi Kleen if (!alloc_bootmem_huge_page(h)) 1189aa888a74SAndi Kleen break; 11909b5e5d0fSLee Schermerhorn } else if (!alloc_fresh_huge_page(h, 11919b5e5d0fSLee Schermerhorn &node_states[N_HIGH_MEMORY])) 11921da177e4SLinus Torvalds break; 11931da177e4SLinus Torvalds } 11948faa8b07SAndi Kleen h->max_huge_pages = i; 1195e5ff2159SAndi Kleen } 1196e5ff2159SAndi Kleen 1197e5ff2159SAndi Kleen static void __init hugetlb_init_hstates(void) 1198e5ff2159SAndi Kleen { 1199e5ff2159SAndi Kleen struct hstate *h; 1200e5ff2159SAndi Kleen 1201e5ff2159SAndi Kleen for_each_hstate(h) { 12028faa8b07SAndi Kleen /* oversize hugepages were init'ed in early boot */ 12038faa8b07SAndi Kleen if (h->order < MAX_ORDER) 12048faa8b07SAndi Kleen hugetlb_hstate_alloc_pages(h); 1205e5ff2159SAndi Kleen } 1206e5ff2159SAndi Kleen } 1207e5ff2159SAndi Kleen 12084abd32dbSAndi Kleen static char * __init memfmt(char *buf, unsigned long n) 12094abd32dbSAndi Kleen { 12104abd32dbSAndi Kleen if (n >= (1UL << 30)) 12114abd32dbSAndi Kleen sprintf(buf, "%lu GB", n >> 30); 12124abd32dbSAndi Kleen else if (n >= (1UL << 20)) 12134abd32dbSAndi Kleen sprintf(buf, "%lu MB", n >> 20); 12144abd32dbSAndi Kleen else 12154abd32dbSAndi Kleen sprintf(buf, "%lu KB", n >> 10); 12164abd32dbSAndi Kleen return buf; 12174abd32dbSAndi Kleen } 12184abd32dbSAndi Kleen 1219e5ff2159SAndi Kleen static void __init report_hugepages(void) 
1220e5ff2159SAndi Kleen { 1221e5ff2159SAndi Kleen struct hstate *h; 1222e5ff2159SAndi Kleen 1223e5ff2159SAndi Kleen for_each_hstate(h) { 12244abd32dbSAndi Kleen char buf[32]; 12254abd32dbSAndi Kleen printk(KERN_INFO "HugeTLB registered %s page size, " 12264abd32dbSAndi Kleen "pre-allocated %ld pages\n", 12274abd32dbSAndi Kleen memfmt(buf, huge_page_size(h)), 12284abd32dbSAndi Kleen h->free_huge_pages); 1229e5ff2159SAndi Kleen } 1230e5ff2159SAndi Kleen } 1231e5ff2159SAndi Kleen 12321da177e4SLinus Torvalds #ifdef CONFIG_HIGHMEM 12336ae11b27SLee Schermerhorn static void try_to_free_low(struct hstate *h, unsigned long count, 12346ae11b27SLee Schermerhorn nodemask_t *nodes_allowed) 12351da177e4SLinus Torvalds { 12364415cc8dSChristoph Lameter int i; 12374415cc8dSChristoph Lameter 1238aa888a74SAndi Kleen if (h->order >= MAX_ORDER) 1239aa888a74SAndi Kleen return; 1240aa888a74SAndi Kleen 12416ae11b27SLee Schermerhorn for_each_node_mask(i, *nodes_allowed) { 12421da177e4SLinus Torvalds struct page *page, *next; 1243a5516438SAndi Kleen struct list_head *freel = &h->hugepage_freelists[i]; 1244a5516438SAndi Kleen list_for_each_entry_safe(page, next, freel, lru) { 1245a5516438SAndi Kleen if (count >= h->nr_huge_pages) 12466b0c880dSAdam Litke return; 12471da177e4SLinus Torvalds if (PageHighMem(page)) 12481da177e4SLinus Torvalds continue; 12491da177e4SLinus Torvalds list_del(&page->lru); 1250e5ff2159SAndi Kleen update_and_free_page(h, page); 1251a5516438SAndi Kleen h->free_huge_pages--; 1252a5516438SAndi Kleen h->free_huge_pages_node[page_to_nid(page)]--; 12531da177e4SLinus Torvalds } 12541da177e4SLinus Torvalds } 12551da177e4SLinus Torvalds } 12561da177e4SLinus Torvalds #else 12576ae11b27SLee Schermerhorn static inline void try_to_free_low(struct hstate *h, unsigned long count, 12586ae11b27SLee Schermerhorn nodemask_t *nodes_allowed) 12591da177e4SLinus Torvalds { 12601da177e4SLinus Torvalds } 12611da177e4SLinus Torvalds #endif 12621da177e4SLinus Torvalds 126320a0307cSWu Fengguang /* 126420a0307cSWu Fengguang * Increment or decrement surplus_huge_pages. Keep node-specific counters 126520a0307cSWu Fengguang * balanced by operating on them in a round-robin fashion. 126620a0307cSWu Fengguang * Returns 1 if an adjustment was made. 
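 *
 * An illustrative pass (counts are hypothetical): with nodes_allowed =
 * {0,1}, surplus_huge_pages_node = {0, 3} and delta == -1, the loop
 * below skips node 0 (nothing to convert) and adjusts node 1, returning
 * 1; with delta == +1 it instead skips any node whose surplus count
 * already matches its nr_huge_pages_node count.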
126720a0307cSWu Fengguang */ 12686ae11b27SLee Schermerhorn static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 12696ae11b27SLee Schermerhorn int delta) 127020a0307cSWu Fengguang { 1271e8c5c824SLee Schermerhorn int start_nid, next_nid; 127220a0307cSWu Fengguang int ret = 0; 127320a0307cSWu Fengguang 127420a0307cSWu Fengguang VM_BUG_ON(delta != -1 && delta != 1); 127520a0307cSWu Fengguang 1276e8c5c824SLee Schermerhorn if (delta < 0) 12776ae11b27SLee Schermerhorn start_nid = hstate_next_node_to_alloc(h, nodes_allowed); 1278e8c5c824SLee Schermerhorn else 12796ae11b27SLee Schermerhorn start_nid = hstate_next_node_to_free(h, nodes_allowed); 1280e8c5c824SLee Schermerhorn next_nid = start_nid; 1281e8c5c824SLee Schermerhorn 1282e8c5c824SLee Schermerhorn do { 1283e8c5c824SLee Schermerhorn int nid = next_nid; 1284e8c5c824SLee Schermerhorn if (delta < 0) { 1285e8c5c824SLee Schermerhorn /* 1286e8c5c824SLee Schermerhorn * To shrink on this node, there must be a surplus page 1287e8c5c824SLee Schermerhorn */ 12889a76db09SLee Schermerhorn if (!h->surplus_huge_pages_node[nid]) { 12896ae11b27SLee Schermerhorn next_nid = hstate_next_node_to_alloc(h, 12906ae11b27SLee Schermerhorn nodes_allowed); 129120a0307cSWu Fengguang continue; 1292e8c5c824SLee Schermerhorn } 12939a76db09SLee Schermerhorn } 1294e8c5c824SLee Schermerhorn if (delta > 0) { 1295e8c5c824SLee Schermerhorn /* 1296e8c5c824SLee Schermerhorn * Surplus cannot exceed the total number of pages 1297e8c5c824SLee Schermerhorn */ 1298e8c5c824SLee Schermerhorn if (h->surplus_huge_pages_node[nid] >= 12999a76db09SLee Schermerhorn h->nr_huge_pages_node[nid]) { 13006ae11b27SLee Schermerhorn next_nid = hstate_next_node_to_free(h, 13016ae11b27SLee Schermerhorn nodes_allowed); 130220a0307cSWu Fengguang continue; 1303e8c5c824SLee Schermerhorn } 13049a76db09SLee Schermerhorn } 130520a0307cSWu Fengguang 130620a0307cSWu Fengguang h->surplus_huge_pages += delta; 130720a0307cSWu Fengguang h->surplus_huge_pages_node[nid] += delta; 130820a0307cSWu Fengguang ret = 1; 130920a0307cSWu Fengguang break; 1310e8c5c824SLee Schermerhorn } while (next_nid != start_nid); 131120a0307cSWu Fengguang 131220a0307cSWu Fengguang return ret; 131320a0307cSWu Fengguang } 131420a0307cSWu Fengguang 1315a5516438SAndi Kleen #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 13166ae11b27SLee Schermerhorn static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, 13176ae11b27SLee Schermerhorn nodemask_t *nodes_allowed) 13181da177e4SLinus Torvalds { 13197893d1d5SAdam Litke unsigned long min_count, ret; 13201da177e4SLinus Torvalds 1321aa888a74SAndi Kleen if (h->order >= MAX_ORDER) 1322aa888a74SAndi Kleen return h->max_huge_pages; 1323aa888a74SAndi Kleen 13247893d1d5SAdam Litke /* 13257893d1d5SAdam Litke * Increase the pool size 13267893d1d5SAdam Litke * First take pages out of surplus state. Then make up the 13277893d1d5SAdam Litke * remaining difference by allocating fresh huge pages. 1328d1c3fb1fSNishanth Aravamudan * 1329d1c3fb1fSNishanth Aravamudan * We might race with alloc_buddy_huge_page() here and be unable 1330d1c3fb1fSNishanth Aravamudan * to convert a surplus huge page to a normal huge page. That is 1331d1c3fb1fSNishanth Aravamudan * not critical, though, it just means the overall size of the 1332d1c3fb1fSNishanth Aravamudan * pool might be one hugepage larger than it needs to be, but 1333d1c3fb1fSNishanth Aravamudan * within all the constraints specified by the sysctls. 
13347893d1d5SAdam Litke */ 13351da177e4SLinus Torvalds spin_lock(&hugetlb_lock); 1336a5516438SAndi Kleen while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 13376ae11b27SLee Schermerhorn if (!adjust_pool_surplus(h, nodes_allowed, -1)) 13387893d1d5SAdam Litke break; 13397893d1d5SAdam Litke } 13407893d1d5SAdam Litke 1341a5516438SAndi Kleen while (count > persistent_huge_pages(h)) { 13427893d1d5SAdam Litke /* 13437893d1d5SAdam Litke * If this allocation races such that we no longer need the 13447893d1d5SAdam Litke * page, free_huge_page will handle it by freeing the page 13457893d1d5SAdam Litke * and reducing the surplus. 13467893d1d5SAdam Litke */ 13477893d1d5SAdam Litke spin_unlock(&hugetlb_lock); 13486ae11b27SLee Schermerhorn ret = alloc_fresh_huge_page(h, nodes_allowed); 13497893d1d5SAdam Litke spin_lock(&hugetlb_lock); 13507893d1d5SAdam Litke if (!ret) 13517893d1d5SAdam Litke goto out; 13527893d1d5SAdam Litke 1353536240f2SMel Gorman /* Bail for signals. Probably ctrl-c from user */ 1354536240f2SMel Gorman if (signal_pending(current)) 1355536240f2SMel Gorman goto out; 13567893d1d5SAdam Litke } 13577893d1d5SAdam Litke 13587893d1d5SAdam Litke /* 13597893d1d5SAdam Litke * Decrease the pool size 13607893d1d5SAdam Litke * First return free pages to the buddy allocator (being careful 13617893d1d5SAdam Litke * to keep enough around to satisfy reservations). Then place 13627893d1d5SAdam Litke * pages into surplus state as needed so the pool will shrink 13637893d1d5SAdam Litke * to the desired size as pages become free. 1364d1c3fb1fSNishanth Aravamudan * 1365d1c3fb1fSNishanth Aravamudan * By placing pages into the surplus state independent of the 1366d1c3fb1fSNishanth Aravamudan * overcommit value, we are allowing the surplus pool size to 1367d1c3fb1fSNishanth Aravamudan * exceed overcommit. There are few sane options here. Since 1368d1c3fb1fSNishanth Aravamudan * alloc_buddy_huge_page() is checking the global counter, 1369d1c3fb1fSNishanth Aravamudan * though, we'll note that we're not allowed to exceed surplus 1370d1c3fb1fSNishanth Aravamudan * and won't grow the pool anywhere else. Not until one of the 1371d1c3fb1fSNishanth Aravamudan * sysctls are changed, or the surplus pages go out of use. 
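 *
 * A worked example with hypothetical counters: resv_huge_pages = 2,
 * nr_huge_pages = 10 and free_huge_pages = 4 give min_count =
 * 2 + 10 - 4 = 8, so a request to shrink to count = 5 frees pages only
 * down to max(5, 8) = 8 and converts the remaining difference to
 * surplus pages, which are returned to the buddy allocator as their
 * users release them.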
13727893d1d5SAdam Litke */ 1373a5516438SAndi Kleen min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 13746b0c880dSAdam Litke min_count = max(count, min_count); 13756ae11b27SLee Schermerhorn try_to_free_low(h, min_count, nodes_allowed); 1376a5516438SAndi Kleen while (min_count < persistent_huge_pages(h)) { 13776ae11b27SLee Schermerhorn if (!free_pool_huge_page(h, nodes_allowed, 0)) 13781da177e4SLinus Torvalds break; 13791da177e4SLinus Torvalds } 1380a5516438SAndi Kleen while (count < persistent_huge_pages(h)) { 13816ae11b27SLee Schermerhorn if (!adjust_pool_surplus(h, nodes_allowed, 1)) 13827893d1d5SAdam Litke break; 13837893d1d5SAdam Litke } 13847893d1d5SAdam Litke out: 1385a5516438SAndi Kleen ret = persistent_huge_pages(h); 13861da177e4SLinus Torvalds spin_unlock(&hugetlb_lock); 13877893d1d5SAdam Litke return ret; 13881da177e4SLinus Torvalds } 13891da177e4SLinus Torvalds 1390a3437870SNishanth Aravamudan #define HSTATE_ATTR_RO(_name) \ 1391a3437870SNishanth Aravamudan static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 1392a3437870SNishanth Aravamudan 1393a3437870SNishanth Aravamudan #define HSTATE_ATTR(_name) \ 1394a3437870SNishanth Aravamudan static struct kobj_attribute _name##_attr = \ 1395a3437870SNishanth Aravamudan __ATTR(_name, 0644, _name##_show, _name##_store) 1396a3437870SNishanth Aravamudan 1397a3437870SNishanth Aravamudan static struct kobject *hugepages_kobj; 1398a3437870SNishanth Aravamudan static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 1399a3437870SNishanth Aravamudan 14009a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); 14019a305230SLee Schermerhorn 14029a305230SLee Schermerhorn static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) 1403a3437870SNishanth Aravamudan { 1404a3437870SNishanth Aravamudan int i; 14059a305230SLee Schermerhorn 1406a3437870SNishanth Aravamudan for (i = 0; i < HUGE_MAX_HSTATE; i++) 14079a305230SLee Schermerhorn if (hstate_kobjs[i] == kobj) { 14089a305230SLee Schermerhorn if (nidp) 14099a305230SLee Schermerhorn *nidp = NUMA_NO_NODE; 1410a3437870SNishanth Aravamudan return &hstates[i]; 14119a305230SLee Schermerhorn } 14129a305230SLee Schermerhorn 14139a305230SLee Schermerhorn return kobj_to_node_hstate(kobj, nidp); 1414a3437870SNishanth Aravamudan } 1415a3437870SNishanth Aravamudan 141606808b08SLee Schermerhorn static ssize_t nr_hugepages_show_common(struct kobject *kobj, 1417a3437870SNishanth Aravamudan struct kobj_attribute *attr, char *buf) 1418a3437870SNishanth Aravamudan { 14199a305230SLee Schermerhorn struct hstate *h; 14209a305230SLee Schermerhorn unsigned long nr_huge_pages; 14219a305230SLee Schermerhorn int nid; 14229a305230SLee Schermerhorn 14239a305230SLee Schermerhorn h = kobj_to_hstate(kobj, &nid); 14249a305230SLee Schermerhorn if (nid == NUMA_NO_NODE) 14259a305230SLee Schermerhorn nr_huge_pages = h->nr_huge_pages; 14269a305230SLee Schermerhorn else 14279a305230SLee Schermerhorn nr_huge_pages = h->nr_huge_pages_node[nid]; 14289a305230SLee Schermerhorn 14299a305230SLee Schermerhorn return sprintf(buf, "%lu\n", nr_huge_pages); 1430a3437870SNishanth Aravamudan } 143106808b08SLee Schermerhorn static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 143206808b08SLee Schermerhorn struct kobject *kobj, struct kobj_attribute *attr, 143306808b08SLee Schermerhorn const char *buf, size_t len) 1434a3437870SNishanth Aravamudan { 1435a3437870SNishanth Aravamudan int err; 14369a305230SLee Schermerhorn int nid; 143706808b08SLee Schermerhorn 
unsigned long count; 14389a305230SLee Schermerhorn struct hstate *h; 1439bad44b5bSDavid Rientjes NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); 1440a3437870SNishanth Aravamudan 144106808b08SLee Schermerhorn err = strict_strtoul(buf, 10, &count); 1442a3437870SNishanth Aravamudan if (err) 1443a3437870SNishanth Aravamudan return 0; 1444a3437870SNishanth Aravamudan 14459a305230SLee Schermerhorn h = kobj_to_hstate(kobj, &nid); 14469a305230SLee Schermerhorn if (nid == NUMA_NO_NODE) { 14479a305230SLee Schermerhorn /* 14489a305230SLee Schermerhorn * global hstate attribute 14499a305230SLee Schermerhorn */ 14509a305230SLee Schermerhorn if (!(obey_mempolicy && 14519a305230SLee Schermerhorn init_nodemask_of_mempolicy(nodes_allowed))) { 145206808b08SLee Schermerhorn NODEMASK_FREE(nodes_allowed); 14539a305230SLee Schermerhorn nodes_allowed = &node_states[N_HIGH_MEMORY]; 145406808b08SLee Schermerhorn } 14559a305230SLee Schermerhorn } else if (nodes_allowed) { 14569a305230SLee Schermerhorn /* 14579a305230SLee Schermerhorn * per node hstate attribute: adjust count to global, 14589a305230SLee Schermerhorn * but restrict alloc/free to the specified node. 14599a305230SLee Schermerhorn */ 14609a305230SLee Schermerhorn count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 14619a305230SLee Schermerhorn init_nodemask_of_node(nodes_allowed, nid); 14629a305230SLee Schermerhorn } else 14639a305230SLee Schermerhorn nodes_allowed = &node_states[N_HIGH_MEMORY]; 14649a305230SLee Schermerhorn 146506808b08SLee Schermerhorn h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); 1466a3437870SNishanth Aravamudan 14679b5e5d0fSLee Schermerhorn if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 146806808b08SLee Schermerhorn NODEMASK_FREE(nodes_allowed); 146906808b08SLee Schermerhorn 147006808b08SLee Schermerhorn return len; 147106808b08SLee Schermerhorn } 147206808b08SLee Schermerhorn 147306808b08SLee Schermerhorn static ssize_t nr_hugepages_show(struct kobject *kobj, 147406808b08SLee Schermerhorn struct kobj_attribute *attr, char *buf) 147506808b08SLee Schermerhorn { 147606808b08SLee Schermerhorn return nr_hugepages_show_common(kobj, attr, buf); 147706808b08SLee Schermerhorn } 147806808b08SLee Schermerhorn 147906808b08SLee Schermerhorn static ssize_t nr_hugepages_store(struct kobject *kobj, 148006808b08SLee Schermerhorn struct kobj_attribute *attr, const char *buf, size_t len) 148106808b08SLee Schermerhorn { 148206808b08SLee Schermerhorn return nr_hugepages_store_common(false, kobj, attr, buf, len); 1483a3437870SNishanth Aravamudan } 1484a3437870SNishanth Aravamudan HSTATE_ATTR(nr_hugepages); 1485a3437870SNishanth Aravamudan 148606808b08SLee Schermerhorn #ifdef CONFIG_NUMA 148706808b08SLee Schermerhorn 148806808b08SLee Schermerhorn /* 148906808b08SLee Schermerhorn * hstate attribute for optionally mempolicy-based constraint on persistent 149006808b08SLee Schermerhorn * huge page alloc/free. 
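 *
 * For example (path is the standard sysfs location, values are
 * hypothetical): a task bound with "numactl -m 1" that writes 8 to
 * /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages_mempolicy
 * grows or shrinks the 2 MB pool on node 1 only, whereas a write to
 * plain nr_hugepages is spread over all nodes with memory.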
149106808b08SLee Schermerhorn */ 149206808b08SLee Schermerhorn static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, 149306808b08SLee Schermerhorn struct kobj_attribute *attr, char *buf) 149406808b08SLee Schermerhorn { 149506808b08SLee Schermerhorn return nr_hugepages_show_common(kobj, attr, buf); 149606808b08SLee Schermerhorn } 149706808b08SLee Schermerhorn 149806808b08SLee Schermerhorn static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 149906808b08SLee Schermerhorn struct kobj_attribute *attr, const char *buf, size_t len) 150006808b08SLee Schermerhorn { 150106808b08SLee Schermerhorn return nr_hugepages_store_common(true, kobj, attr, buf, len); 150206808b08SLee Schermerhorn } 150306808b08SLee Schermerhorn HSTATE_ATTR(nr_hugepages_mempolicy); 150406808b08SLee Schermerhorn #endif 150506808b08SLee Schermerhorn 150606808b08SLee Schermerhorn 1507a3437870SNishanth Aravamudan static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 1508a3437870SNishanth Aravamudan struct kobj_attribute *attr, char *buf) 1509a3437870SNishanth Aravamudan { 15109a305230SLee Schermerhorn struct hstate *h = kobj_to_hstate(kobj, NULL); 1511a3437870SNishanth Aravamudan return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 1512a3437870SNishanth Aravamudan } 1513a3437870SNishanth Aravamudan static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 1514a3437870SNishanth Aravamudan struct kobj_attribute *attr, const char *buf, size_t count) 1515a3437870SNishanth Aravamudan { 1516a3437870SNishanth Aravamudan int err; 1517a3437870SNishanth Aravamudan unsigned long input; 15189a305230SLee Schermerhorn struct hstate *h = kobj_to_hstate(kobj, NULL); 1519a3437870SNishanth Aravamudan 1520a3437870SNishanth Aravamudan err = strict_strtoul(buf, 10, &input); 1521a3437870SNishanth Aravamudan if (err) 1522a3437870SNishanth Aravamudan return 0; 1523a3437870SNishanth Aravamudan 1524a3437870SNishanth Aravamudan spin_lock(&hugetlb_lock); 1525a3437870SNishanth Aravamudan h->nr_overcommit_huge_pages = input; 1526a3437870SNishanth Aravamudan spin_unlock(&hugetlb_lock); 1527a3437870SNishanth Aravamudan 1528a3437870SNishanth Aravamudan return count; 1529a3437870SNishanth Aravamudan } 1530a3437870SNishanth Aravamudan HSTATE_ATTR(nr_overcommit_hugepages); 1531a3437870SNishanth Aravamudan 1532a3437870SNishanth Aravamudan static ssize_t free_hugepages_show(struct kobject *kobj, 1533a3437870SNishanth Aravamudan struct kobj_attribute *attr, char *buf) 1534a3437870SNishanth Aravamudan { 15359a305230SLee Schermerhorn struct hstate *h; 15369a305230SLee Schermerhorn unsigned long free_huge_pages; 15379a305230SLee Schermerhorn int nid; 15389a305230SLee Schermerhorn 15399a305230SLee Schermerhorn h = kobj_to_hstate(kobj, &nid); 15409a305230SLee Schermerhorn if (nid == NUMA_NO_NODE) 15419a305230SLee Schermerhorn free_huge_pages = h->free_huge_pages; 15429a305230SLee Schermerhorn else 15439a305230SLee Schermerhorn free_huge_pages = h->free_huge_pages_node[nid]; 15449a305230SLee Schermerhorn 15459a305230SLee Schermerhorn return sprintf(buf, "%lu\n", free_huge_pages); 1546a3437870SNishanth Aravamudan } 1547a3437870SNishanth Aravamudan HSTATE_ATTR_RO(free_hugepages); 1548a3437870SNishanth Aravamudan 1549a3437870SNishanth Aravamudan static ssize_t resv_hugepages_show(struct kobject *kobj, 1550a3437870SNishanth Aravamudan struct kobj_attribute *attr, char *buf) 1551a3437870SNishanth Aravamudan { 15529a305230SLee Schermerhorn struct hstate *h = kobj_to_hstate(kobj, NULL); 1553a3437870SNishanth Aravamudan return 
sprintf(buf, "%lu\n", h->resv_huge_pages); 1554a3437870SNishanth Aravamudan } 1555a3437870SNishanth Aravamudan HSTATE_ATTR_RO(resv_hugepages); 1556a3437870SNishanth Aravamudan 1557a3437870SNishanth Aravamudan static ssize_t surplus_hugepages_show(struct kobject *kobj, 1558a3437870SNishanth Aravamudan struct kobj_attribute *attr, char *buf) 1559a3437870SNishanth Aravamudan { 15609a305230SLee Schermerhorn struct hstate *h; 15619a305230SLee Schermerhorn unsigned long surplus_huge_pages; 15629a305230SLee Schermerhorn int nid; 15639a305230SLee Schermerhorn 15649a305230SLee Schermerhorn h = kobj_to_hstate(kobj, &nid); 15659a305230SLee Schermerhorn if (nid == NUMA_NO_NODE) 15669a305230SLee Schermerhorn surplus_huge_pages = h->surplus_huge_pages; 15679a305230SLee Schermerhorn else 15689a305230SLee Schermerhorn surplus_huge_pages = h->surplus_huge_pages_node[nid]; 15699a305230SLee Schermerhorn 15709a305230SLee Schermerhorn return sprintf(buf, "%lu\n", surplus_huge_pages); 1571a3437870SNishanth Aravamudan } 1572a3437870SNishanth Aravamudan HSTATE_ATTR_RO(surplus_hugepages); 1573a3437870SNishanth Aravamudan 1574a3437870SNishanth Aravamudan static struct attribute *hstate_attrs[] = { 1575a3437870SNishanth Aravamudan &nr_hugepages_attr.attr, 1576a3437870SNishanth Aravamudan &nr_overcommit_hugepages_attr.attr, 1577a3437870SNishanth Aravamudan &free_hugepages_attr.attr, 1578a3437870SNishanth Aravamudan &resv_hugepages_attr.attr, 1579a3437870SNishanth Aravamudan &surplus_hugepages_attr.attr, 158006808b08SLee Schermerhorn #ifdef CONFIG_NUMA 158106808b08SLee Schermerhorn &nr_hugepages_mempolicy_attr.attr, 158206808b08SLee Schermerhorn #endif 1583a3437870SNishanth Aravamudan NULL, 1584a3437870SNishanth Aravamudan }; 1585a3437870SNishanth Aravamudan 1586a3437870SNishanth Aravamudan static struct attribute_group hstate_attr_group = { 1587a3437870SNishanth Aravamudan .attrs = hstate_attrs, 1588a3437870SNishanth Aravamudan }; 1589a3437870SNishanth Aravamudan 1590094e9539SJeff Mahoney static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 15919a305230SLee Schermerhorn struct kobject **hstate_kobjs, 15929a305230SLee Schermerhorn struct attribute_group *hstate_attr_group) 1593a3437870SNishanth Aravamudan { 1594a3437870SNishanth Aravamudan int retval; 15959a305230SLee Schermerhorn int hi = h - hstates; 1596a3437870SNishanth Aravamudan 15979a305230SLee Schermerhorn hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 15989a305230SLee Schermerhorn if (!hstate_kobjs[hi]) 1599a3437870SNishanth Aravamudan return -ENOMEM; 1600a3437870SNishanth Aravamudan 16019a305230SLee Schermerhorn retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); 1602a3437870SNishanth Aravamudan if (retval) 16039a305230SLee Schermerhorn kobject_put(hstate_kobjs[hi]); 1604a3437870SNishanth Aravamudan 1605a3437870SNishanth Aravamudan return retval; 1606a3437870SNishanth Aravamudan } 1607a3437870SNishanth Aravamudan 1608a3437870SNishanth Aravamudan static void __init hugetlb_sysfs_init(void) 1609a3437870SNishanth Aravamudan { 1610a3437870SNishanth Aravamudan struct hstate *h; 1611a3437870SNishanth Aravamudan int err; 1612a3437870SNishanth Aravamudan 1613a3437870SNishanth Aravamudan hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); 1614a3437870SNishanth Aravamudan if (!hugepages_kobj) 1615a3437870SNishanth Aravamudan return; 1616a3437870SNishanth Aravamudan 1617a3437870SNishanth Aravamudan for_each_hstate(h) { 16189a305230SLee Schermerhorn err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 
16199a305230SLee Schermerhorn hstate_kobjs, &hstate_attr_group); 1620a3437870SNishanth Aravamudan if (err) 1621a3437870SNishanth Aravamudan printk(KERN_ERR "Hugetlb: Unable to add hstate %s", 1622a3437870SNishanth Aravamudan h->name); 1623a3437870SNishanth Aravamudan } 1624a3437870SNishanth Aravamudan } 1625a3437870SNishanth Aravamudan 16269a305230SLee Schermerhorn #ifdef CONFIG_NUMA 16279a305230SLee Schermerhorn 16289a305230SLee Schermerhorn /* 16299a305230SLee Schermerhorn * node_hstate/s - associate per node hstate attributes, via their kobjects, 16309a305230SLee Schermerhorn * with node sysdevs in node_devices[] using a parallel array. The array 16319a305230SLee Schermerhorn * index of a node sysdev or _hstate == node id. 16329a305230SLee Schermerhorn * This is here to avoid any static dependency of the node sysdev driver, in 16339a305230SLee Schermerhorn * the base kernel, on the hugetlb module. 16349a305230SLee Schermerhorn */ 16359a305230SLee Schermerhorn struct node_hstate { 16369a305230SLee Schermerhorn struct kobject *hugepages_kobj; 16379a305230SLee Schermerhorn struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 16389a305230SLee Schermerhorn }; 16399a305230SLee Schermerhorn struct node_hstate node_hstates[MAX_NUMNODES]; 16409a305230SLee Schermerhorn 16419a305230SLee Schermerhorn /* 16429a305230SLee Schermerhorn * A subset of global hstate attributes for node sysdevs 16439a305230SLee Schermerhorn */ 16449a305230SLee Schermerhorn static struct attribute *per_node_hstate_attrs[] = { 16459a305230SLee Schermerhorn &nr_hugepages_attr.attr, 16469a305230SLee Schermerhorn &free_hugepages_attr.attr, 16479a305230SLee Schermerhorn &surplus_hugepages_attr.attr, 16489a305230SLee Schermerhorn NULL, 16499a305230SLee Schermerhorn }; 16509a305230SLee Schermerhorn 16519a305230SLee Schermerhorn static struct attribute_group per_node_hstate_attr_group = { 16529a305230SLee Schermerhorn .attrs = per_node_hstate_attrs, 16539a305230SLee Schermerhorn }; 16549a305230SLee Schermerhorn 16559a305230SLee Schermerhorn /* 16569a305230SLee Schermerhorn * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. 16579a305230SLee Schermerhorn * Returns node id via non-NULL nidp. 16589a305230SLee Schermerhorn */ 16599a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 16609a305230SLee Schermerhorn { 16619a305230SLee Schermerhorn int nid; 16629a305230SLee Schermerhorn 16639a305230SLee Schermerhorn for (nid = 0; nid < nr_node_ids; nid++) { 16649a305230SLee Schermerhorn struct node_hstate *nhs = &node_hstates[nid]; 16659a305230SLee Schermerhorn int i; 16669a305230SLee Schermerhorn for (i = 0; i < HUGE_MAX_HSTATE; i++) 16679a305230SLee Schermerhorn if (nhs->hstate_kobjs[i] == kobj) { 16689a305230SLee Schermerhorn if (nidp) 16699a305230SLee Schermerhorn *nidp = nid; 16709a305230SLee Schermerhorn return &hstates[i]; 16719a305230SLee Schermerhorn } 16729a305230SLee Schermerhorn } 16739a305230SLee Schermerhorn 16749a305230SLee Schermerhorn BUG(); 16759a305230SLee Schermerhorn return NULL; 16769a305230SLee Schermerhorn } 16779a305230SLee Schermerhorn 16789a305230SLee Schermerhorn /* 16799a305230SLee Schermerhorn * Unregister hstate attributes from a single node sysdev. 16809a305230SLee Schermerhorn * No-op if no hstate attributes attached. 
16819a305230SLee Schermerhorn */ 16829a305230SLee Schermerhorn void hugetlb_unregister_node(struct node *node) 16839a305230SLee Schermerhorn { 16849a305230SLee Schermerhorn struct hstate *h; 16859a305230SLee Schermerhorn struct node_hstate *nhs = &node_hstates[node->sysdev.id]; 16869a305230SLee Schermerhorn 16879a305230SLee Schermerhorn if (!nhs->hugepages_kobj) 16889b5e5d0fSLee Schermerhorn return; /* no hstate attributes */ 16899a305230SLee Schermerhorn 16909a305230SLee Schermerhorn for_each_hstate(h) 16919a305230SLee Schermerhorn if (nhs->hstate_kobjs[h - hstates]) { 16929a305230SLee Schermerhorn kobject_put(nhs->hstate_kobjs[h - hstates]); 16939a305230SLee Schermerhorn nhs->hstate_kobjs[h - hstates] = NULL; 16949a305230SLee Schermerhorn } 16959a305230SLee Schermerhorn 16969a305230SLee Schermerhorn kobject_put(nhs->hugepages_kobj); 16979a305230SLee Schermerhorn nhs->hugepages_kobj = NULL; 16989a305230SLee Schermerhorn } 16999a305230SLee Schermerhorn 17009a305230SLee Schermerhorn /* 17019a305230SLee Schermerhorn * hugetlb module exit: unregister hstate attributes from node sysdevs 17029a305230SLee Schermerhorn * that have them. 17039a305230SLee Schermerhorn */ 17049a305230SLee Schermerhorn static void hugetlb_unregister_all_nodes(void) 17059a305230SLee Schermerhorn { 17069a305230SLee Schermerhorn int nid; 17079a305230SLee Schermerhorn 17089a305230SLee Schermerhorn /* 17099a305230SLee Schermerhorn * disable node sysdev registrations. 17109a305230SLee Schermerhorn */ 17119a305230SLee Schermerhorn register_hugetlbfs_with_node(NULL, NULL); 17129a305230SLee Schermerhorn 17139a305230SLee Schermerhorn /* 17149a305230SLee Schermerhorn * remove hstate attributes from any nodes that have them. 17159a305230SLee Schermerhorn */ 17169a305230SLee Schermerhorn for (nid = 0; nid < nr_node_ids; nid++) 17179a305230SLee Schermerhorn hugetlb_unregister_node(&node_devices[nid]); 17189a305230SLee Schermerhorn } 17199a305230SLee Schermerhorn 17209a305230SLee Schermerhorn /* 17219a305230SLee Schermerhorn * Register hstate attributes for a single node sysdev. 17229a305230SLee Schermerhorn * No-op if attributes already registered. 
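 *
 * The result, for an illustrative 2 MB hstate on node 1, is a per-node
 * subset of the global attributes, e.g.
 * /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages
 * alongside free_hugepages and surplus_hugepages.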
17239a305230SLee Schermerhorn */ 17249a305230SLee Schermerhorn void hugetlb_register_node(struct node *node) 17259a305230SLee Schermerhorn { 17269a305230SLee Schermerhorn struct hstate *h; 17279a305230SLee Schermerhorn struct node_hstate *nhs = &node_hstates[node->sysdev.id]; 17289a305230SLee Schermerhorn int err; 17299a305230SLee Schermerhorn 17309a305230SLee Schermerhorn if (nhs->hugepages_kobj) 17319a305230SLee Schermerhorn return; /* already allocated */ 17329a305230SLee Schermerhorn 17339a305230SLee Schermerhorn nhs->hugepages_kobj = kobject_create_and_add("hugepages", 17349a305230SLee Schermerhorn &node->sysdev.kobj); 17359a305230SLee Schermerhorn if (!nhs->hugepages_kobj) 17369a305230SLee Schermerhorn return; 17379a305230SLee Schermerhorn 17389a305230SLee Schermerhorn for_each_hstate(h) { 17399a305230SLee Schermerhorn err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, 17409a305230SLee Schermerhorn nhs->hstate_kobjs, 17419a305230SLee Schermerhorn &per_node_hstate_attr_group); 17429a305230SLee Schermerhorn if (err) { 17439a305230SLee Schermerhorn printk(KERN_ERR "Hugetlb: Unable to add hstate %s" 17449a305230SLee Schermerhorn " for node %d\n", 17459a305230SLee Schermerhorn h->name, node->sysdev.id); 17469a305230SLee Schermerhorn hugetlb_unregister_node(node); 17479a305230SLee Schermerhorn break; 17489a305230SLee Schermerhorn } 17499a305230SLee Schermerhorn } 17509a305230SLee Schermerhorn } 17519a305230SLee Schermerhorn 17529a305230SLee Schermerhorn /* 17539b5e5d0fSLee Schermerhorn * hugetlb init time: register hstate attributes for all registered node 17549b5e5d0fSLee Schermerhorn * sysdevs of nodes that have memory. All on-line nodes should have 17559b5e5d0fSLee Schermerhorn * registered their associated sysdev by this time. 17569a305230SLee Schermerhorn */ 17579a305230SLee Schermerhorn static void hugetlb_register_all_nodes(void) 17589a305230SLee Schermerhorn { 17599a305230SLee Schermerhorn int nid; 17609a305230SLee Schermerhorn 17619b5e5d0fSLee Schermerhorn for_each_node_state(nid, N_HIGH_MEMORY) { 17629a305230SLee Schermerhorn struct node *node = &node_devices[nid]; 17639a305230SLee Schermerhorn if (node->sysdev.id == nid) 17649a305230SLee Schermerhorn hugetlb_register_node(node); 17659a305230SLee Schermerhorn } 17669a305230SLee Schermerhorn 17679a305230SLee Schermerhorn /* 17689a305230SLee Schermerhorn * Let the node sysdev driver know we're here so it can 17699a305230SLee Schermerhorn * [un]register hstate attributes on node hotplug. 
17709a305230SLee Schermerhorn */ 17719a305230SLee Schermerhorn register_hugetlbfs_with_node(hugetlb_register_node, 17729a305230SLee Schermerhorn hugetlb_unregister_node); 17739a305230SLee Schermerhorn } 17749a305230SLee Schermerhorn #else /* !CONFIG_NUMA */ 17759a305230SLee Schermerhorn 17769a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 17779a305230SLee Schermerhorn { 17789a305230SLee Schermerhorn BUG(); 17799a305230SLee Schermerhorn if (nidp) 17809a305230SLee Schermerhorn *nidp = -1; 17819a305230SLee Schermerhorn return NULL; 17829a305230SLee Schermerhorn } 17839a305230SLee Schermerhorn 17849a305230SLee Schermerhorn static void hugetlb_unregister_all_nodes(void) { } 17859a305230SLee Schermerhorn 17869a305230SLee Schermerhorn static void hugetlb_register_all_nodes(void) { } 17879a305230SLee Schermerhorn 17889a305230SLee Schermerhorn #endif 17899a305230SLee Schermerhorn 1790a3437870SNishanth Aravamudan static void __exit hugetlb_exit(void) 1791a3437870SNishanth Aravamudan { 1792a3437870SNishanth Aravamudan struct hstate *h; 1793a3437870SNishanth Aravamudan 17949a305230SLee Schermerhorn hugetlb_unregister_all_nodes(); 17959a305230SLee Schermerhorn 1796a3437870SNishanth Aravamudan for_each_hstate(h) { 1797a3437870SNishanth Aravamudan kobject_put(hstate_kobjs[h - hstates]); 1798a3437870SNishanth Aravamudan } 1799a3437870SNishanth Aravamudan 1800a3437870SNishanth Aravamudan kobject_put(hugepages_kobj); 1801a3437870SNishanth Aravamudan } 1802a3437870SNishanth Aravamudan module_exit(hugetlb_exit); 1803a3437870SNishanth Aravamudan 1804a3437870SNishanth Aravamudan static int __init hugetlb_init(void) 1805a3437870SNishanth Aravamudan { 18060ef89d25SBenjamin Herrenschmidt /* Some platform decide whether they support huge pages at boot 18070ef89d25SBenjamin Herrenschmidt * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when 18080ef89d25SBenjamin Herrenschmidt * there is no such support 18090ef89d25SBenjamin Herrenschmidt */ 18100ef89d25SBenjamin Herrenschmidt if (HPAGE_SHIFT == 0) 18110ef89d25SBenjamin Herrenschmidt return 0; 1812a3437870SNishanth Aravamudan 1813e11bfbfcSNick Piggin if (!size_to_hstate(default_hstate_size)) { 1814e11bfbfcSNick Piggin default_hstate_size = HPAGE_SIZE; 1815e11bfbfcSNick Piggin if (!size_to_hstate(default_hstate_size)) 1816a3437870SNishanth Aravamudan hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 1817a3437870SNishanth Aravamudan } 1818e11bfbfcSNick Piggin default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; 1819e11bfbfcSNick Piggin if (default_hstate_max_huge_pages) 1820e11bfbfcSNick Piggin default_hstate.max_huge_pages = default_hstate_max_huge_pages; 1821a3437870SNishanth Aravamudan 1822a3437870SNishanth Aravamudan hugetlb_init_hstates(); 1823a3437870SNishanth Aravamudan 1824aa888a74SAndi Kleen gather_bootmem_prealloc(); 1825aa888a74SAndi Kleen 1826a3437870SNishanth Aravamudan report_hugepages(); 1827a3437870SNishanth Aravamudan 1828a3437870SNishanth Aravamudan hugetlb_sysfs_init(); 1829a3437870SNishanth Aravamudan 18309a305230SLee Schermerhorn hugetlb_register_all_nodes(); 18319a305230SLee Schermerhorn 1832a3437870SNishanth Aravamudan return 0; 1833a3437870SNishanth Aravamudan } 1834a3437870SNishanth Aravamudan module_init(hugetlb_init); 1835a3437870SNishanth Aravamudan 1836a3437870SNishanth Aravamudan /* Should be called on processing a hugepagesz=... 
option */ 1837a3437870SNishanth Aravamudan void __init hugetlb_add_hstate(unsigned order) 1838a3437870SNishanth Aravamudan { 1839a3437870SNishanth Aravamudan struct hstate *h; 18408faa8b07SAndi Kleen unsigned long i; 18418faa8b07SAndi Kleen 1842a3437870SNishanth Aravamudan if (size_to_hstate(PAGE_SIZE << order)) { 1843a3437870SNishanth Aravamudan printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); 1844a3437870SNishanth Aravamudan return; 1845a3437870SNishanth Aravamudan } 1846a3437870SNishanth Aravamudan BUG_ON(max_hstate >= HUGE_MAX_HSTATE); 1847a3437870SNishanth Aravamudan BUG_ON(order == 0); 1848a3437870SNishanth Aravamudan h = &hstates[max_hstate++]; 1849a3437870SNishanth Aravamudan h->order = order; 1850a3437870SNishanth Aravamudan h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 18518faa8b07SAndi Kleen h->nr_huge_pages = 0; 18528faa8b07SAndi Kleen h->free_huge_pages = 0; 18538faa8b07SAndi Kleen for (i = 0; i < MAX_NUMNODES; ++i) 18548faa8b07SAndi Kleen INIT_LIST_HEAD(&h->hugepage_freelists[i]); 18559b5e5d0fSLee Schermerhorn h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); 18569b5e5d0fSLee Schermerhorn h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); 1857a3437870SNishanth Aravamudan snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1858a3437870SNishanth Aravamudan huge_page_size(h)/1024); 18598faa8b07SAndi Kleen 1860a3437870SNishanth Aravamudan parsed_hstate = h; 1861a3437870SNishanth Aravamudan } 1862a3437870SNishanth Aravamudan 1863e11bfbfcSNick Piggin static int __init hugetlb_nrpages_setup(char *s) 1864a3437870SNishanth Aravamudan { 1865a3437870SNishanth Aravamudan unsigned long *mhp; 18668faa8b07SAndi Kleen static unsigned long *last_mhp; 1867a3437870SNishanth Aravamudan 1868a3437870SNishanth Aravamudan /* 1869a3437870SNishanth Aravamudan * !max_hstate means we haven't parsed a hugepagesz= parameter yet, 1870a3437870SNishanth Aravamudan * so this hugepages= parameter goes to the "default hstate". 1871a3437870SNishanth Aravamudan */ 1872a3437870SNishanth Aravamudan if (!max_hstate) 1873a3437870SNishanth Aravamudan mhp = &default_hstate_max_huge_pages; 1874a3437870SNishanth Aravamudan else 1875a3437870SNishanth Aravamudan mhp = &parsed_hstate->max_huge_pages; 1876a3437870SNishanth Aravamudan 18778faa8b07SAndi Kleen if (mhp == last_mhp) { 18788faa8b07SAndi Kleen printk(KERN_WARNING "hugepages= specified twice without " 18798faa8b07SAndi Kleen "interleaving hugepagesz=, ignoring\n"); 18808faa8b07SAndi Kleen return 1; 18818faa8b07SAndi Kleen } 18828faa8b07SAndi Kleen 1883a3437870SNishanth Aravamudan if (sscanf(s, "%lu", mhp) <= 0) 1884a3437870SNishanth Aravamudan *mhp = 0; 1885a3437870SNishanth Aravamudan 18868faa8b07SAndi Kleen /* 18878faa8b07SAndi Kleen * Global state is always initialized later in hugetlb_init. 18888faa8b07SAndi Kleen * But we need to allocate >= MAX_ORDER hstates here early to still 18898faa8b07SAndi Kleen * use the bootmem allocator. 
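 *
 * For example, a hypothetical command line of
 * "hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=512" carves the
 * four 1 GB pages (order >= MAX_ORDER) out of bootmem right here, while
 * the 2 MB pool is only populated later from hugetlb_init_hstates().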
18908faa8b07SAndi Kleen */ 18918faa8b07SAndi Kleen if (max_hstate && parsed_hstate->order >= MAX_ORDER) 18928faa8b07SAndi Kleen hugetlb_hstate_alloc_pages(parsed_hstate); 18938faa8b07SAndi Kleen 18948faa8b07SAndi Kleen last_mhp = mhp; 18958faa8b07SAndi Kleen 1896a3437870SNishanth Aravamudan return 1; 1897a3437870SNishanth Aravamudan } 1898e11bfbfcSNick Piggin __setup("hugepages=", hugetlb_nrpages_setup); 1899e11bfbfcSNick Piggin 1900e11bfbfcSNick Piggin static int __init hugetlb_default_setup(char *s) 1901e11bfbfcSNick Piggin { 1902e11bfbfcSNick Piggin default_hstate_size = memparse(s, &s); 1903e11bfbfcSNick Piggin return 1; 1904e11bfbfcSNick Piggin } 1905e11bfbfcSNick Piggin __setup("default_hugepagesz=", hugetlb_default_setup); 1906a3437870SNishanth Aravamudan 19078a213460SNishanth Aravamudan static unsigned int cpuset_mems_nr(unsigned int *array) 19088a213460SNishanth Aravamudan { 19098a213460SNishanth Aravamudan int node; 19108a213460SNishanth Aravamudan unsigned int nr = 0; 19118a213460SNishanth Aravamudan 19128a213460SNishanth Aravamudan for_each_node_mask(node, cpuset_current_mems_allowed) 19138a213460SNishanth Aravamudan nr += array[node]; 19148a213460SNishanth Aravamudan 19158a213460SNishanth Aravamudan return nr; 19168a213460SNishanth Aravamudan } 19178a213460SNishanth Aravamudan 19188a213460SNishanth Aravamudan #ifdef CONFIG_SYSCTL 191906808b08SLee Schermerhorn static int hugetlb_sysctl_handler_common(bool obey_mempolicy, 192006808b08SLee Schermerhorn struct ctl_table *table, int write, 192106808b08SLee Schermerhorn void __user *buffer, size_t *length, loff_t *ppos) 19221da177e4SLinus Torvalds { 1923e5ff2159SAndi Kleen struct hstate *h = &default_hstate; 1924e5ff2159SAndi Kleen unsigned long tmp; 1925e5ff2159SAndi Kleen 1926e5ff2159SAndi Kleen if (!write) 1927e5ff2159SAndi Kleen tmp = h->max_huge_pages; 1928e5ff2159SAndi Kleen 1929e5ff2159SAndi Kleen table->data = &tmp; 1930e5ff2159SAndi Kleen table->maxlen = sizeof(unsigned long); 19318d65af78SAlexey Dobriyan proc_doulongvec_minmax(table, write, buffer, length, ppos); 1932e5ff2159SAndi Kleen 193306808b08SLee Schermerhorn if (write) { 1934bad44b5bSDavid Rientjes NODEMASK_ALLOC(nodemask_t, nodes_allowed, 1935bad44b5bSDavid Rientjes GFP_KERNEL | __GFP_NORETRY); 193606808b08SLee Schermerhorn if (!(obey_mempolicy && 193706808b08SLee Schermerhorn init_nodemask_of_mempolicy(nodes_allowed))) { 193806808b08SLee Schermerhorn NODEMASK_FREE(nodes_allowed); 193906808b08SLee Schermerhorn nodes_allowed = &node_states[N_HIGH_MEMORY]; 194006808b08SLee Schermerhorn } 194106808b08SLee Schermerhorn h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); 194206808b08SLee Schermerhorn 194306808b08SLee Schermerhorn if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 194406808b08SLee Schermerhorn NODEMASK_FREE(nodes_allowed); 194506808b08SLee Schermerhorn } 1946e5ff2159SAndi Kleen 19471da177e4SLinus Torvalds return 0; 19481da177e4SLinus Torvalds } 1949396faf03SMel Gorman 195006808b08SLee Schermerhorn int hugetlb_sysctl_handler(struct ctl_table *table, int write, 195106808b08SLee Schermerhorn void __user *buffer, size_t *length, loff_t *ppos) 195206808b08SLee Schermerhorn { 195306808b08SLee Schermerhorn 195406808b08SLee Schermerhorn return hugetlb_sysctl_handler_common(false, table, write, 195506808b08SLee Schermerhorn buffer, length, ppos); 195606808b08SLee Schermerhorn } 195706808b08SLee Schermerhorn 195806808b08SLee Schermerhorn #ifdef CONFIG_NUMA 195906808b08SLee Schermerhorn int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int 
write, 196006808b08SLee Schermerhorn void __user *buffer, size_t *length, loff_t *ppos) 196106808b08SLee Schermerhorn { 196206808b08SLee Schermerhorn return hugetlb_sysctl_handler_common(true, table, write, 196306808b08SLee Schermerhorn buffer, length, ppos); 196406808b08SLee Schermerhorn } 196506808b08SLee Schermerhorn #endif /* CONFIG_NUMA */ 196606808b08SLee Schermerhorn 1967396faf03SMel Gorman int hugetlb_treat_movable_handler(struct ctl_table *table, int write, 19688d65af78SAlexey Dobriyan void __user *buffer, 1969396faf03SMel Gorman size_t *length, loff_t *ppos) 1970396faf03SMel Gorman { 19718d65af78SAlexey Dobriyan proc_dointvec(table, write, buffer, length, ppos); 1972396faf03SMel Gorman if (hugepages_treat_as_movable) 1973396faf03SMel Gorman htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; 1974396faf03SMel Gorman else 1975396faf03SMel Gorman htlb_alloc_mask = GFP_HIGHUSER; 1976396faf03SMel Gorman return 0; 1977396faf03SMel Gorman } 1978396faf03SMel Gorman 1979a3d0c6aaSNishanth Aravamudan int hugetlb_overcommit_handler(struct ctl_table *table, int write, 19808d65af78SAlexey Dobriyan void __user *buffer, 1981a3d0c6aaSNishanth Aravamudan size_t *length, loff_t *ppos) 1982a3d0c6aaSNishanth Aravamudan { 1983a5516438SAndi Kleen struct hstate *h = &default_hstate; 1984e5ff2159SAndi Kleen unsigned long tmp; 1985e5ff2159SAndi Kleen 1986e5ff2159SAndi Kleen if (!write) 1987e5ff2159SAndi Kleen tmp = h->nr_overcommit_huge_pages; 1988e5ff2159SAndi Kleen 1989e5ff2159SAndi Kleen table->data = &tmp; 1990e5ff2159SAndi Kleen table->maxlen = sizeof(unsigned long); 19918d65af78SAlexey Dobriyan proc_doulongvec_minmax(table, write, buffer, length, ppos); 1992e5ff2159SAndi Kleen 1993e5ff2159SAndi Kleen if (write) { 1994064d9efeSNishanth Aravamudan spin_lock(&hugetlb_lock); 1995e5ff2159SAndi Kleen h->nr_overcommit_huge_pages = tmp; 1996a3d0c6aaSNishanth Aravamudan spin_unlock(&hugetlb_lock); 1997e5ff2159SAndi Kleen } 1998e5ff2159SAndi Kleen 1999a3d0c6aaSNishanth Aravamudan return 0; 2000a3d0c6aaSNishanth Aravamudan } 2001a3d0c6aaSNishanth Aravamudan 20021da177e4SLinus Torvalds #endif /* CONFIG_SYSCTL */ 20031da177e4SLinus Torvalds 2004e1759c21SAlexey Dobriyan void hugetlb_report_meminfo(struct seq_file *m) 20051da177e4SLinus Torvalds { 2006a5516438SAndi Kleen struct hstate *h = &default_hstate; 2007e1759c21SAlexey Dobriyan seq_printf(m, 20081da177e4SLinus Torvalds "HugePages_Total: %5lu\n" 20091da177e4SLinus Torvalds "HugePages_Free: %5lu\n" 2010b45b5bd6SDavid Gibson "HugePages_Rsvd: %5lu\n" 20117893d1d5SAdam Litke "HugePages_Surp: %5lu\n" 20124f98a2feSRik van Riel "Hugepagesize: %8lu kB\n", 2013a5516438SAndi Kleen h->nr_huge_pages, 2014a5516438SAndi Kleen h->free_huge_pages, 2015a5516438SAndi Kleen h->resv_huge_pages, 2016a5516438SAndi Kleen h->surplus_huge_pages, 2017a5516438SAndi Kleen 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); 20181da177e4SLinus Torvalds } 20191da177e4SLinus Torvalds 20201da177e4SLinus Torvalds int hugetlb_report_node_meminfo(int nid, char *buf) 20211da177e4SLinus Torvalds { 2022a5516438SAndi Kleen struct hstate *h = &default_hstate; 20231da177e4SLinus Torvalds return sprintf(buf, 20241da177e4SLinus Torvalds "Node %d HugePages_Total: %5u\n" 2025a1de0919SNishanth Aravamudan "Node %d HugePages_Free: %5u\n" 2026a1de0919SNishanth Aravamudan "Node %d HugePages_Surp: %5u\n", 2027a5516438SAndi Kleen nid, h->nr_huge_pages_node[nid], 2028a5516438SAndi Kleen nid, h->free_huge_pages_node[nid], 2029a5516438SAndi Kleen nid, h->surplus_huge_pages_node[nid]); 20301da177e4SLinus Torvalds } 
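/*
 * A sketch of what the two reporting helpers above emit, using
 * hypothetical counter values for a single default 2 MB hstate
 * (column widths approximate):
 *
 *	HugePages_Total:   512
 *	HugePages_Free:    200
 *	HugePages_Rsvd:     64
 *	HugePages_Surp:      0
 *	Hugepagesize:     2048 kB
 *
 * hugetlb_report_node_meminfo() produces the matching per-node lines,
 * e.g. "Node 0 HugePages_Total:   512".
 */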
20311da177e4SLinus Torvalds 20321da177e4SLinus Torvalds /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 20331da177e4SLinus Torvalds unsigned long hugetlb_total_pages(void) 20341da177e4SLinus Torvalds { 2035a5516438SAndi Kleen struct hstate *h = &default_hstate; 2036a5516438SAndi Kleen return h->nr_huge_pages * pages_per_huge_page(h); 20371da177e4SLinus Torvalds } 20381da177e4SLinus Torvalds 2039a5516438SAndi Kleen static int hugetlb_acct_memory(struct hstate *h, long delta) 2040fc1b8a73SMel Gorman { 2041fc1b8a73SMel Gorman int ret = -ENOMEM; 2042fc1b8a73SMel Gorman 2043fc1b8a73SMel Gorman spin_lock(&hugetlb_lock); 2044fc1b8a73SMel Gorman /* 2045fc1b8a73SMel Gorman * When cpuset is configured, it breaks the strict hugetlb page 2046fc1b8a73SMel Gorman * reservation as the accounting is done on a global variable. Such 2047fc1b8a73SMel Gorman * reservation is completely rubbish in the presence of cpuset because 2048fc1b8a73SMel Gorman * the reservation is not checked against page availability for the 2049fc1b8a73SMel Gorman * current cpuset. Application can still potentially OOM'ed by kernel 2050fc1b8a73SMel Gorman * with lack of free htlb page in cpuset that the task is in. 2051fc1b8a73SMel Gorman * Attempt to enforce strict accounting with cpuset is almost 2052fc1b8a73SMel Gorman * impossible (or too ugly) because cpuset is too fluid that 2053fc1b8a73SMel Gorman * task or memory node can be dynamically moved between cpusets. 2054fc1b8a73SMel Gorman * 2055fc1b8a73SMel Gorman * The change of semantics for shared hugetlb mapping with cpuset is 2056fc1b8a73SMel Gorman * undesirable. However, in order to preserve some of the semantics, 2057fc1b8a73SMel Gorman * we fall back to check against current free page availability as 2058fc1b8a73SMel Gorman * a best attempt and hopefully to minimize the impact of changing 2059fc1b8a73SMel Gorman * semantics that cpuset has. 2060fc1b8a73SMel Gorman */ 2061fc1b8a73SMel Gorman if (delta > 0) { 2062a5516438SAndi Kleen if (gather_surplus_pages(h, delta) < 0) 2063fc1b8a73SMel Gorman goto out; 2064fc1b8a73SMel Gorman 2065a5516438SAndi Kleen if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { 2066a5516438SAndi Kleen return_unused_surplus_pages(h, delta); 2067fc1b8a73SMel Gorman goto out; 2068fc1b8a73SMel Gorman } 2069fc1b8a73SMel Gorman } 2070fc1b8a73SMel Gorman 2071fc1b8a73SMel Gorman ret = 0; 2072fc1b8a73SMel Gorman if (delta < 0) 2073a5516438SAndi Kleen return_unused_surplus_pages(h, (unsigned long) -delta); 2074fc1b8a73SMel Gorman 2075fc1b8a73SMel Gorman out: 2076fc1b8a73SMel Gorman spin_unlock(&hugetlb_lock); 2077fc1b8a73SMel Gorman return ret; 2078fc1b8a73SMel Gorman } 2079fc1b8a73SMel Gorman 208084afd99bSAndy Whitcroft static void hugetlb_vm_op_open(struct vm_area_struct *vma) 208184afd99bSAndy Whitcroft { 208284afd99bSAndy Whitcroft struct resv_map *reservations = vma_resv_map(vma); 208384afd99bSAndy Whitcroft 208484afd99bSAndy Whitcroft /* 208584afd99bSAndy Whitcroft * This new VMA should share its siblings reservation map if present. 208684afd99bSAndy Whitcroft * The VMA will only ever have a valid reservation map pointer where 208784afd99bSAndy Whitcroft * it is being copied for another still existing VMA. As that VMA 208884afd99bSAndy Whitcroft * has a reference to the reservation map it cannot dissappear until 208984afd99bSAndy Whitcroft * after this open call completes. It is therefore safe to take a 209084afd99bSAndy Whitcroft * new reference here without additional locking. 
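 *
 * For instance, when dup_mmap() copies a private hugetlb VMA at fork,
 * ->open points here and the child takes an extra kref on the parent's
 * resv_map; each hugetlb_vm_op_close() below drops one reference, and
 * the last put frees the map via resv_map_release().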
209184afd99bSAndy Whitcroft */ 209284afd99bSAndy Whitcroft if (reservations) 209384afd99bSAndy Whitcroft kref_get(&reservations->refs); 209484afd99bSAndy Whitcroft } 209584afd99bSAndy Whitcroft 2096a1e78772SMel Gorman static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2097a1e78772SMel Gorman { 2098a5516438SAndi Kleen struct hstate *h = hstate_vma(vma); 209984afd99bSAndy Whitcroft struct resv_map *reservations = vma_resv_map(vma); 210084afd99bSAndy Whitcroft unsigned long reserve; 210184afd99bSAndy Whitcroft unsigned long start; 210284afd99bSAndy Whitcroft unsigned long end; 210384afd99bSAndy Whitcroft 210484afd99bSAndy Whitcroft if (reservations) { 2105a5516438SAndi Kleen start = vma_hugecache_offset(h, vma, vma->vm_start); 2106a5516438SAndi Kleen end = vma_hugecache_offset(h, vma, vma->vm_end); 210784afd99bSAndy Whitcroft 210884afd99bSAndy Whitcroft reserve = (end - start) - 210984afd99bSAndy Whitcroft region_count(&reservations->regions, start, end); 211084afd99bSAndy Whitcroft 211184afd99bSAndy Whitcroft kref_put(&reservations->refs, resv_map_release); 211284afd99bSAndy Whitcroft 21137251ff78SAdam Litke if (reserve) { 2114a5516438SAndi Kleen hugetlb_acct_memory(h, -reserve); 21157251ff78SAdam Litke hugetlb_put_quota(vma->vm_file->f_mapping, reserve); 21167251ff78SAdam Litke } 2117a1e78772SMel Gorman } 211884afd99bSAndy Whitcroft } 2119a1e78772SMel Gorman 21201da177e4SLinus Torvalds /* 21211da177e4SLinus Torvalds * We cannot handle pagefaults against hugetlb pages at all. They cause 21221da177e4SLinus Torvalds * handle_mm_fault() to try to instantiate regular-sized pages in the 21231da177e4SLinus Torvalds * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get 21241da177e4SLinus Torvalds * this far. 21251da177e4SLinus Torvalds */ 2126d0217ac0SNick Piggin static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 21271da177e4SLinus Torvalds { 21281da177e4SLinus Torvalds BUG(); 2129d0217ac0SNick Piggin return 0; 21301da177e4SLinus Torvalds } 21311da177e4SLinus Torvalds 2132f0f37e2fSAlexey Dobriyan const struct vm_operations_struct hugetlb_vm_ops = { 2133d0217ac0SNick Piggin .fault = hugetlb_vm_op_fault, 213484afd99bSAndy Whitcroft .open = hugetlb_vm_op_open, 2135a1e78772SMel Gorman .close = hugetlb_vm_op_close, 21361da177e4SLinus Torvalds }; 21371da177e4SLinus Torvalds 21381e8f889bSDavid Gibson static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 21391e8f889bSDavid Gibson int writable) 214063551ae0SDavid Gibson { 214163551ae0SDavid Gibson pte_t entry; 214263551ae0SDavid Gibson 21431e8f889bSDavid Gibson if (writable) { 214463551ae0SDavid Gibson entry = 214563551ae0SDavid Gibson pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 214663551ae0SDavid Gibson } else { 21477f2e9525SGerald Schaefer entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot)); 214863551ae0SDavid Gibson } 214963551ae0SDavid Gibson entry = pte_mkyoung(entry); 215063551ae0SDavid Gibson entry = pte_mkhuge(entry); 215163551ae0SDavid Gibson 215263551ae0SDavid Gibson return entry; 215363551ae0SDavid Gibson } 215463551ae0SDavid Gibson 21551e8f889bSDavid Gibson static void set_huge_ptep_writable(struct vm_area_struct *vma, 21561e8f889bSDavid Gibson unsigned long address, pte_t *ptep) 21571e8f889bSDavid Gibson { 21581e8f889bSDavid Gibson pte_t entry; 21591e8f889bSDavid Gibson 21607f2e9525SGerald Schaefer entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); 21617f2e9525SGerald Schaefer if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 
{ 21624b3073e1SRussell King update_mmu_cache(vma, address, ptep); 21631e8f889bSDavid Gibson } 21648dab5241SBenjamin Herrenschmidt } 21651e8f889bSDavid Gibson 21661e8f889bSDavid Gibson 216763551ae0SDavid Gibson int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 216863551ae0SDavid Gibson struct vm_area_struct *vma) 216963551ae0SDavid Gibson { 217063551ae0SDavid Gibson pte_t *src_pte, *dst_pte, entry; 217163551ae0SDavid Gibson struct page *ptepage; 21721c59827dSHugh Dickins unsigned long addr; 21731e8f889bSDavid Gibson int cow; 2174a5516438SAndi Kleen struct hstate *h = hstate_vma(vma); 2175a5516438SAndi Kleen unsigned long sz = huge_page_size(h); 21761e8f889bSDavid Gibson 21771e8f889bSDavid Gibson cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 217863551ae0SDavid Gibson 2179a5516438SAndi Kleen for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 2180c74df32cSHugh Dickins src_pte = huge_pte_offset(src, addr); 2181c74df32cSHugh Dickins if (!src_pte) 2182c74df32cSHugh Dickins continue; 2183a5516438SAndi Kleen dst_pte = huge_pte_alloc(dst, addr, sz); 218463551ae0SDavid Gibson if (!dst_pte) 218563551ae0SDavid Gibson goto nomem; 2186c5c99429SLarry Woodman 2187c5c99429SLarry Woodman /* If the pagetables are shared don't copy or take references */ 2188c5c99429SLarry Woodman if (dst_pte == src_pte) 2189c5c99429SLarry Woodman continue; 2190c5c99429SLarry Woodman 2191c74df32cSHugh Dickins spin_lock(&dst->page_table_lock); 219246478758SNick Piggin spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); 21937f2e9525SGerald Schaefer if (!huge_pte_none(huge_ptep_get(src_pte))) { 21941e8f889bSDavid Gibson if (cow) 21957f2e9525SGerald Schaefer huge_ptep_set_wrprotect(src, addr, src_pte); 21967f2e9525SGerald Schaefer entry = huge_ptep_get(src_pte); 219763551ae0SDavid Gibson ptepage = pte_page(entry); 219863551ae0SDavid Gibson get_page(ptepage); 21990fe6e20bSNaoya Horiguchi page_dup_rmap(ptepage); 220063551ae0SDavid Gibson set_huge_pte_at(dst, addr, dst_pte, entry); 22011c59827dSHugh Dickins } 22021c59827dSHugh Dickins spin_unlock(&src->page_table_lock); 2203c74df32cSHugh Dickins spin_unlock(&dst->page_table_lock); 220463551ae0SDavid Gibson } 220563551ae0SDavid Gibson return 0; 220663551ae0SDavid Gibson 220763551ae0SDavid Gibson nomem: 220863551ae0SDavid Gibson return -ENOMEM; 220963551ae0SDavid Gibson } 221063551ae0SDavid Gibson 2211290408d4SNaoya Horiguchi static int is_hugetlb_entry_migration(pte_t pte) 2212290408d4SNaoya Horiguchi { 2213290408d4SNaoya Horiguchi swp_entry_t swp; 2214290408d4SNaoya Horiguchi 2215290408d4SNaoya Horiguchi if (huge_pte_none(pte) || pte_present(pte)) 2216290408d4SNaoya Horiguchi return 0; 2217290408d4SNaoya Horiguchi swp = pte_to_swp_entry(pte); 2218290408d4SNaoya Horiguchi if (non_swap_entry(swp) && is_migration_entry(swp)) { 2219290408d4SNaoya Horiguchi return 1; 2220290408d4SNaoya Horiguchi } else 2221290408d4SNaoya Horiguchi return 0; 2222290408d4SNaoya Horiguchi } 2223290408d4SNaoya Horiguchi 2224fd6a03edSNaoya Horiguchi static int is_hugetlb_entry_hwpoisoned(pte_t pte) 2225fd6a03edSNaoya Horiguchi { 2226fd6a03edSNaoya Horiguchi swp_entry_t swp; 2227fd6a03edSNaoya Horiguchi 2228fd6a03edSNaoya Horiguchi if (huge_pte_none(pte) || pte_present(pte)) 2229fd6a03edSNaoya Horiguchi return 0; 2230fd6a03edSNaoya Horiguchi swp = pte_to_swp_entry(pte); 2231fd6a03edSNaoya Horiguchi if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { 2232fd6a03edSNaoya Horiguchi return 1; 2233fd6a03edSNaoya Horiguchi } else 2234fd6a03edSNaoya 
Horiguchi return 0; 2235fd6a03edSNaoya Horiguchi } 2236fd6a03edSNaoya Horiguchi 2237502717f4SChen, Kenneth W void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 223804f2cbe3SMel Gorman unsigned long end, struct page *ref_page) 223963551ae0SDavid Gibson { 224063551ae0SDavid Gibson struct mm_struct *mm = vma->vm_mm; 224163551ae0SDavid Gibson unsigned long address; 2242c7546f8fSDavid Gibson pte_t *ptep; 224363551ae0SDavid Gibson pte_t pte; 224463551ae0SDavid Gibson struct page *page; 2245fe1668aeSChen, Kenneth W struct page *tmp; 2246a5516438SAndi Kleen struct hstate *h = hstate_vma(vma); 2247a5516438SAndi Kleen unsigned long sz = huge_page_size(h); 2248a5516438SAndi Kleen 2249c0a499c2SChen, Kenneth W /* 2250c0a499c2SChen, Kenneth W * A page gathering list, protected by per file i_mmap_lock. The 2251c0a499c2SChen, Kenneth W * lock is used to avoid list corruption from multiple unmapping 2252c0a499c2SChen, Kenneth W * of the same page since we are using page->lru. 2253c0a499c2SChen, Kenneth W */ 2254fe1668aeSChen, Kenneth W LIST_HEAD(page_list); 225563551ae0SDavid Gibson 225663551ae0SDavid Gibson WARN_ON(!is_vm_hugetlb_page(vma)); 2257a5516438SAndi Kleen BUG_ON(start & ~huge_page_mask(h)); 2258a5516438SAndi Kleen BUG_ON(end & ~huge_page_mask(h)); 225963551ae0SDavid Gibson 2260cddb8a5cSAndrea Arcangeli mmu_notifier_invalidate_range_start(mm, start, end); 2261508034a3SHugh Dickins spin_lock(&mm->page_table_lock); 2262a5516438SAndi Kleen for (address = start; address < end; address += sz) { 2263c7546f8fSDavid Gibson ptep = huge_pte_offset(mm, address); 2264c7546f8fSDavid Gibson if (!ptep) 2265c7546f8fSDavid Gibson continue; 2266c7546f8fSDavid Gibson 226739dde65cSChen, Kenneth W if (huge_pmd_unshare(mm, &address, ptep)) 226839dde65cSChen, Kenneth W continue; 226939dde65cSChen, Kenneth W 227004f2cbe3SMel Gorman /* 227104f2cbe3SMel Gorman * If a reference page is supplied, it is because a specific 227204f2cbe3SMel Gorman * page is being unmapped, not a range. Ensure the page we 227304f2cbe3SMel Gorman * are about to unmap is the actual page of interest. 
227404f2cbe3SMel Gorman */ 227504f2cbe3SMel Gorman if (ref_page) { 227604f2cbe3SMel Gorman pte = huge_ptep_get(ptep); 227704f2cbe3SMel Gorman if (huge_pte_none(pte)) 227804f2cbe3SMel Gorman continue; 227904f2cbe3SMel Gorman page = pte_page(pte); 228004f2cbe3SMel Gorman if (page != ref_page) 228104f2cbe3SMel Gorman continue; 228204f2cbe3SMel Gorman 228304f2cbe3SMel Gorman /* 228404f2cbe3SMel Gorman * Mark the VMA as having unmapped its page so that 228504f2cbe3SMel Gorman * future faults in this VMA will fail rather than 228604f2cbe3SMel Gorman * looking like data was lost 228704f2cbe3SMel Gorman */ 228804f2cbe3SMel Gorman set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); 228904f2cbe3SMel Gorman } 229004f2cbe3SMel Gorman 2291c7546f8fSDavid Gibson pte = huge_ptep_get_and_clear(mm, address, ptep); 22927f2e9525SGerald Schaefer if (huge_pte_none(pte)) 229363551ae0SDavid Gibson continue; 2294c7546f8fSDavid Gibson 2295fd6a03edSNaoya Horiguchi /* 2296fd6a03edSNaoya Horiguchi * HWPoisoned hugepage is already unmapped and dropped reference 2297fd6a03edSNaoya Horiguchi */ 2298fd6a03edSNaoya Horiguchi if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) 2299fd6a03edSNaoya Horiguchi continue; 2300fd6a03edSNaoya Horiguchi 230163551ae0SDavid Gibson page = pte_page(pte); 23026649a386SKen Chen if (pte_dirty(pte)) 23036649a386SKen Chen set_page_dirty(page); 2304fe1668aeSChen, Kenneth W list_add(&page->lru, &page_list); 230563551ae0SDavid Gibson } 23061da177e4SLinus Torvalds spin_unlock(&mm->page_table_lock); 2307508034a3SHugh Dickins flush_tlb_range(vma, start, end); 2308cddb8a5cSAndrea Arcangeli mmu_notifier_invalidate_range_end(mm, start, end); 2309fe1668aeSChen, Kenneth W list_for_each_entry_safe(page, tmp, &page_list, lru) { 23100fe6e20bSNaoya Horiguchi page_remove_rmap(page); 2311fe1668aeSChen, Kenneth W list_del(&page->lru); 2312fe1668aeSChen, Kenneth W put_page(page); 2313fe1668aeSChen, Kenneth W } 23141da177e4SLinus Torvalds } 231563551ae0SDavid Gibson 2316502717f4SChen, Kenneth W void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 231704f2cbe3SMel Gorman unsigned long end, struct page *ref_page) 2318502717f4SChen, Kenneth W { 2319502717f4SChen, Kenneth W spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 232004f2cbe3SMel Gorman __unmap_hugepage_range(vma, start, end, ref_page); 2321502717f4SChen, Kenneth W spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 2322502717f4SChen, Kenneth W } 2323502717f4SChen, Kenneth W 232404f2cbe3SMel Gorman /* 232504f2cbe3SMel Gorman * This is called when the original mapper is failing to COW a MAP_PRIVATE 232604f2cbe3SMel Gorman * mapping it owns the reserve page for. The intention is to unmap the page 232704f2cbe3SMel Gorman * from other VMAs and let the children be SIGKILLed if they are faulting the 232804f2cbe3SMel Gorman * same region. 232904f2cbe3SMel Gorman */ 23302a4b3dedSHarvey Harrison static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 23312a4b3dedSHarvey Harrison struct page *page, unsigned long address) 233204f2cbe3SMel Gorman { 23337526674dSAdam Litke struct hstate *h = hstate_vma(vma); 233404f2cbe3SMel Gorman struct vm_area_struct *iter_vma; 233504f2cbe3SMel Gorman struct address_space *mapping; 233604f2cbe3SMel Gorman struct prio_tree_iter iter; 233704f2cbe3SMel Gorman pgoff_t pgoff; 233804f2cbe3SMel Gorman 233904f2cbe3SMel Gorman /* 234004f2cbe3SMel Gorman * vm_pgoff is in PAGE_SIZE units, hence the different calculation 234104f2cbe3SMel Gorman * from page cache lookup which is in HPAGE_SIZE units.
234204f2cbe3SMel Gorman */ 23437526674dSAdam Litke address = address & huge_page_mask(h); 234404f2cbe3SMel Gorman pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) 234504f2cbe3SMel Gorman + (vma->vm_pgoff >> PAGE_SHIFT); 234604f2cbe3SMel Gorman mapping = (struct address_space *)page_private(page); 234704f2cbe3SMel Gorman 23484eb2b1dcSMel Gorman /* 23494eb2b1dcSMel Gorman * Take the mapping lock for the duration of the table walk. As 23504eb2b1dcSMel Gorman * this mapping should be shared between all the VMAs, 23514eb2b1dcSMel Gorman * __unmap_hugepage_range() is called as the lock is already held 23524eb2b1dcSMel Gorman */ 23534eb2b1dcSMel Gorman spin_lock(&mapping->i_mmap_lock); 235404f2cbe3SMel Gorman vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 235504f2cbe3SMel Gorman /* Do not unmap the current VMA */ 235604f2cbe3SMel Gorman if (iter_vma == vma) 235704f2cbe3SMel Gorman continue; 235804f2cbe3SMel Gorman 235904f2cbe3SMel Gorman /* 236004f2cbe3SMel Gorman * Unmap the page from other VMAs without their own reserves. 236104f2cbe3SMel Gorman * They get marked to be SIGKILLed if they fault in these 236204f2cbe3SMel Gorman * areas. This is because a future no-page fault on this VMA 236304f2cbe3SMel Gorman * could insert a zeroed page instead of the data existing 236404f2cbe3SMel Gorman * from the time of fork. This would look like data corruption 236504f2cbe3SMel Gorman */ 236604f2cbe3SMel Gorman if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 23674eb2b1dcSMel Gorman __unmap_hugepage_range(iter_vma, 23687526674dSAdam Litke address, address + huge_page_size(h), 236904f2cbe3SMel Gorman page); 237004f2cbe3SMel Gorman } 23714eb2b1dcSMel Gorman spin_unlock(&mapping->i_mmap_lock); 237204f2cbe3SMel Gorman 237304f2cbe3SMel Gorman return 1; 237404f2cbe3SMel Gorman } 237504f2cbe3SMel Gorman 23760fe6e20bSNaoya Horiguchi /* 23770fe6e20bSNaoya Horiguchi * Hugetlb_cow() should be called with page lock of the original hugepage held. 23780fe6e20bSNaoya Horiguchi */ 23791e8f889bSDavid Gibson static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 238004f2cbe3SMel Gorman unsigned long address, pte_t *ptep, pte_t pte, 238104f2cbe3SMel Gorman struct page *pagecache_page) 23821e8f889bSDavid Gibson { 2383a5516438SAndi Kleen struct hstate *h = hstate_vma(vma); 23841e8f889bSDavid Gibson struct page *old_page, *new_page; 238579ac6ba4SDavid Gibson int avoidcopy; 238604f2cbe3SMel Gorman int outside_reserve = 0; 23871e8f889bSDavid Gibson 23881e8f889bSDavid Gibson old_page = pte_page(pte); 23891e8f889bSDavid Gibson 239004f2cbe3SMel Gorman retry_avoidcopy: 23911e8f889bSDavid Gibson /* If no-one else is actually using this page, avoid the copy 23921e8f889bSDavid Gibson * and just make the page writable */ 23930fe6e20bSNaoya Horiguchi avoidcopy = (page_mapcount(old_page) == 1); 23941e8f889bSDavid Gibson if (avoidcopy) { 23950fe6e20bSNaoya Horiguchi if (PageAnon(old_page)) 23960fe6e20bSNaoya Horiguchi page_move_anon_rmap(old_page, vma, address); 23971e8f889bSDavid Gibson set_huge_ptep_writable(vma, address, ptep); 239883c54070SNick Piggin return 0; 23991e8f889bSDavid Gibson } 24001e8f889bSDavid Gibson 240104f2cbe3SMel Gorman /* 240204f2cbe3SMel Gorman * If the process that created a MAP_PRIVATE mapping is about to 240304f2cbe3SMel Gorman * perform a COW due to a shared page count, attempt to satisfy 240404f2cbe3SMel Gorman * the allocation without using the existing reserves. 
The pagecache 240504f2cbe3SMel Gorman * page is used to determine if the reserve at this address was 240604f2cbe3SMel Gorman * consumed or not. If reserves were used, a partial faulted mapping 240704f2cbe3SMel Gorman * at the time of fork() could consume its reserves on COW instead 240804f2cbe3SMel Gorman * of the full address range. 240904f2cbe3SMel Gorman */ 2410f83a275dSMel Gorman if (!(vma->vm_flags & VM_MAYSHARE) && 241104f2cbe3SMel Gorman is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 241204f2cbe3SMel Gorman old_page != pagecache_page) 241304f2cbe3SMel Gorman outside_reserve = 1; 241404f2cbe3SMel Gorman 24151e8f889bSDavid Gibson page_cache_get(old_page); 2416b76c8cfbSLarry Woodman 2417b76c8cfbSLarry Woodman /* Drop page_table_lock as buddy allocator may be called */ 2418b76c8cfbSLarry Woodman spin_unlock(&mm->page_table_lock); 241904f2cbe3SMel Gorman new_page = alloc_huge_page(vma, address, outside_reserve); 24201e8f889bSDavid Gibson 24212fc39cecSAdam Litke if (IS_ERR(new_page)) { 24221e8f889bSDavid Gibson page_cache_release(old_page); 242304f2cbe3SMel Gorman 242404f2cbe3SMel Gorman /* 242504f2cbe3SMel Gorman * If a process owning a MAP_PRIVATE mapping fails to COW, 242604f2cbe3SMel Gorman * it is due to references held by a child and an insufficient 242704f2cbe3SMel Gorman * huge page pool. To guarantee the original mapper's 242804f2cbe3SMel Gorman * reliability, unmap the page from child processes. The child 242904f2cbe3SMel Gorman * may get SIGKILLed if it later faults. 243004f2cbe3SMel Gorman */ 243104f2cbe3SMel Gorman if (outside_reserve) { 243204f2cbe3SMel Gorman BUG_ON(huge_pte_none(pte)); 243304f2cbe3SMel Gorman if (unmap_ref_private(mm, vma, old_page, address)) { 243404f2cbe3SMel Gorman BUG_ON(page_count(old_page) != 1); 243504f2cbe3SMel Gorman BUG_ON(huge_pte_none(pte)); 2436b76c8cfbSLarry Woodman spin_lock(&mm->page_table_lock); 243704f2cbe3SMel Gorman goto retry_avoidcopy; 243804f2cbe3SMel Gorman } 243904f2cbe3SMel Gorman WARN_ON_ONCE(1); 244004f2cbe3SMel Gorman } 244104f2cbe3SMel Gorman 2442b76c8cfbSLarry Woodman /* Caller expects lock to be held */ 2443b76c8cfbSLarry Woodman spin_lock(&mm->page_table_lock); 24442fc39cecSAdam Litke return -PTR_ERR(new_page); 24451e8f889bSDavid Gibson } 24461e8f889bSDavid Gibson 24470fe6e20bSNaoya Horiguchi /* 24480fe6e20bSNaoya Horiguchi * When the original hugepage is a shared one, it does not have 24490fe6e20bSNaoya Horiguchi * anon_vma prepared.
24500fe6e20bSNaoya Horiguchi */ 245144e2aa93SDean Nelson if (unlikely(anon_vma_prepare(vma))) { 245244e2aa93SDean Nelson /* Caller expects lock to be held */ 245344e2aa93SDean Nelson spin_lock(&mm->page_table_lock); 24540fe6e20bSNaoya Horiguchi return VM_FAULT_OOM; 245544e2aa93SDean Nelson } 24560fe6e20bSNaoya Horiguchi 24570ebabb41SNaoya Horiguchi copy_user_huge_page(new_page, old_page, address, vma); 24580ed361deSNick Piggin __SetPageUptodate(new_page); 24591e8f889bSDavid Gibson 2460b76c8cfbSLarry Woodman /* 2461b76c8cfbSLarry Woodman * Retake the page_table_lock to check for racing updates 2462b76c8cfbSLarry Woodman * before the page tables are altered 2463b76c8cfbSLarry Woodman */ 2464b76c8cfbSLarry Woodman spin_lock(&mm->page_table_lock); 2465a5516438SAndi Kleen ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 24667f2e9525SGerald Schaefer if (likely(pte_same(huge_ptep_get(ptep), pte))) { 24671e8f889bSDavid Gibson /* Break COW */ 24683edd4fc9SDoug Doan mmu_notifier_invalidate_range_start(mm, 24693edd4fc9SDoug Doan address & huge_page_mask(h), 24703edd4fc9SDoug Doan (address & huge_page_mask(h)) + huge_page_size(h)); 24718fe627ecSGerald Schaefer huge_ptep_clear_flush(vma, address, ptep); 24721e8f889bSDavid Gibson set_huge_pte_at(mm, address, ptep, 24731e8f889bSDavid Gibson make_huge_pte(vma, new_page, 1)); 24740fe6e20bSNaoya Horiguchi page_remove_rmap(old_page); 2475cd67f0d2SNaoya Horiguchi hugepage_add_new_anon_rmap(new_page, vma, address); 24761e8f889bSDavid Gibson /* Make the old page be freed below */ 24771e8f889bSDavid Gibson new_page = old_page; 24783edd4fc9SDoug Doan mmu_notifier_invalidate_range_end(mm, 24793edd4fc9SDoug Doan address & huge_page_mask(h), 24803edd4fc9SDoug Doan (address & huge_page_mask(h)) + huge_page_size(h)); 24811e8f889bSDavid Gibson } 24821e8f889bSDavid Gibson page_cache_release(new_page); 24831e8f889bSDavid Gibson page_cache_release(old_page); 248483c54070SNick Piggin return 0; 24851e8f889bSDavid Gibson } 24861e8f889bSDavid Gibson 248704f2cbe3SMel Gorman /* Return the pagecache page at a given address within a VMA */ 2488a5516438SAndi Kleen static struct page *hugetlbfs_pagecache_page(struct hstate *h, 2489a5516438SAndi Kleen struct vm_area_struct *vma, unsigned long address) 249004f2cbe3SMel Gorman { 249104f2cbe3SMel Gorman struct address_space *mapping; 2492e7c4b0bfSAndy Whitcroft pgoff_t idx; 249304f2cbe3SMel Gorman 249404f2cbe3SMel Gorman mapping = vma->vm_file->f_mapping; 2495a5516438SAndi Kleen idx = vma_hugecache_offset(h, vma, address); 249604f2cbe3SMel Gorman 249704f2cbe3SMel Gorman return find_lock_page(mapping, idx); 249804f2cbe3SMel Gorman } 249904f2cbe3SMel Gorman 25003ae77f43SHugh Dickins /* 25013ae77f43SHugh Dickins * Return whether there is a pagecache page to back given address within VMA. 25023ae77f43SHugh Dickins * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 
25033ae77f43SHugh Dickins */ 25043ae77f43SHugh Dickins static bool hugetlbfs_pagecache_present(struct hstate *h, 25052a15efc9SHugh Dickins struct vm_area_struct *vma, unsigned long address) 25062a15efc9SHugh Dickins { 25072a15efc9SHugh Dickins struct address_space *mapping; 25082a15efc9SHugh Dickins pgoff_t idx; 25092a15efc9SHugh Dickins struct page *page; 25102a15efc9SHugh Dickins 25112a15efc9SHugh Dickins mapping = vma->vm_file->f_mapping; 25122a15efc9SHugh Dickins idx = vma_hugecache_offset(h, vma, address); 25132a15efc9SHugh Dickins 25142a15efc9SHugh Dickins page = find_get_page(mapping, idx); 25152a15efc9SHugh Dickins if (page) 25162a15efc9SHugh Dickins put_page(page); 25172a15efc9SHugh Dickins return page != NULL; 25182a15efc9SHugh Dickins } 25192a15efc9SHugh Dickins 2520a1ed3ddaSRobert P. J. Day static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 2521788c7df4SHugh Dickins unsigned long address, pte_t *ptep, unsigned int flags) 2522ac9b9c66SHugh Dickins { 2523a5516438SAndi Kleen struct hstate *h = hstate_vma(vma); 2524ac9b9c66SHugh Dickins int ret = VM_FAULT_SIGBUS; 2525e7c4b0bfSAndy Whitcroft pgoff_t idx; 25264c887265SAdam Litke unsigned long size; 25274c887265SAdam Litke struct page *page; 25284c887265SAdam Litke struct address_space *mapping; 25291e8f889bSDavid Gibson pte_t new_pte; 25304c887265SAdam Litke 253104f2cbe3SMel Gorman /* 253204f2cbe3SMel Gorman * Currently, we are forced to kill the process in the event the 253304f2cbe3SMel Gorman * original mapper has unmapped pages from the child due to a failed 253404f2cbe3SMel Gorman * COW. Warn that such a situation has occurred as it may not be obvious 253504f2cbe3SMel Gorman */ 253604f2cbe3SMel Gorman if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 253704f2cbe3SMel Gorman printk(KERN_WARNING 253804f2cbe3SMel Gorman "PID %d killed due to inadequate hugepage pool\n", 253904f2cbe3SMel Gorman current->pid); 254004f2cbe3SMel Gorman return ret; 254104f2cbe3SMel Gorman } 254204f2cbe3SMel Gorman 25434c887265SAdam Litke mapping = vma->vm_file->f_mapping; 2544a5516438SAndi Kleen idx = vma_hugecache_offset(h, vma, address); 25454c887265SAdam Litke 25464c887265SAdam Litke /* 25474c887265SAdam Litke * Use page lock to guard against racing truncation 25484c887265SAdam Litke * before we get page_table_lock.
25494c887265SAdam Litke */ 25506bda666aSChristoph Lameter retry: 25516bda666aSChristoph Lameter page = find_lock_page(mapping, idx); 25526bda666aSChristoph Lameter if (!page) { 2553a5516438SAndi Kleen size = i_size_read(mapping->host) >> huge_page_shift(h); 2554ebed4bfcSHugh Dickins if (idx >= size) 2555ebed4bfcSHugh Dickins goto out; 255604f2cbe3SMel Gorman page = alloc_huge_page(vma, address, 0); 25572fc39cecSAdam Litke if (IS_ERR(page)) { 25582fc39cecSAdam Litke ret = -PTR_ERR(page); 25596bda666aSChristoph Lameter goto out; 25606bda666aSChristoph Lameter } 2561a5516438SAndi Kleen clear_huge_page(page, address, huge_page_size(h)); 25620ed361deSNick Piggin __SetPageUptodate(page); 2563ac9b9c66SHugh Dickins 2564f83a275dSMel Gorman if (vma->vm_flags & VM_MAYSHARE) { 25656bda666aSChristoph Lameter int err; 256645c682a6SKen Chen struct inode *inode = mapping->host; 25676bda666aSChristoph Lameter 25686bda666aSChristoph Lameter err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 25696bda666aSChristoph Lameter if (err) { 25706bda666aSChristoph Lameter put_page(page); 25716bda666aSChristoph Lameter if (err == -EEXIST) 25726bda666aSChristoph Lameter goto retry; 25736bda666aSChristoph Lameter goto out; 25746bda666aSChristoph Lameter } 257545c682a6SKen Chen 257645c682a6SKen Chen spin_lock(&inode->i_lock); 2577a5516438SAndi Kleen inode->i_blocks += blocks_per_huge_page(h); 257845c682a6SKen Chen spin_unlock(&inode->i_lock); 25790fe6e20bSNaoya Horiguchi page_dup_rmap(page); 258023be7468SMel Gorman } else { 25816bda666aSChristoph Lameter lock_page(page); 25820fe6e20bSNaoya Horiguchi if (unlikely(anon_vma_prepare(vma))) { 25830fe6e20bSNaoya Horiguchi ret = VM_FAULT_OOM; 25840fe6e20bSNaoya Horiguchi goto backout_unlocked; 258523be7468SMel Gorman } 25860fe6e20bSNaoya Horiguchi hugepage_add_new_anon_rmap(page, vma, address); 25870fe6e20bSNaoya Horiguchi } 25880fe6e20bSNaoya Horiguchi } else { 258957303d80SAndy Whitcroft /* 2590998b4382SNaoya Horiguchi * If a memory error occurs between mmap() and fault, some processes 2591998b4382SNaoya Horiguchi * don't have a hwpoisoned swap entry for the errored virtual address. 2592998b4382SNaoya Horiguchi * So we need to block the hugepage fault by checking the PG_hwpoison bit. 2593fd6a03edSNaoya Horiguchi */ 2594fd6a03edSNaoya Horiguchi if (unlikely(PageHWPoison(page))) { 2595aa50d3a7SAndi Kleen ret = VM_FAULT_HWPOISON | 2596aa50d3a7SAndi Kleen VM_FAULT_SET_HINDEX(h - hstates); 2597fd6a03edSNaoya Horiguchi goto backout_unlocked; 25986bda666aSChristoph Lameter } 2599998b4382SNaoya Horiguchi page_dup_rmap(page); 2600998b4382SNaoya Horiguchi } 26011e8f889bSDavid Gibson 260257303d80SAndy Whitcroft /* 260357303d80SAndy Whitcroft * If we are going to COW a private mapping later, we examine the 260457303d80SAndy Whitcroft * pending reservations for this page now. This will ensure that 260557303d80SAndy Whitcroft * any allocations necessary to record that reservation occur outside 260657303d80SAndy Whitcroft * the spinlock.
260757303d80SAndy Whitcroft */ 2608788c7df4SHugh Dickins if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) 26092b26736cSAndy Whitcroft if (vma_needs_reservation(h, vma, address) < 0) { 26102b26736cSAndy Whitcroft ret = VM_FAULT_OOM; 26112b26736cSAndy Whitcroft goto backout_unlocked; 26122b26736cSAndy Whitcroft } 261357303d80SAndy Whitcroft 2614ac9b9c66SHugh Dickins spin_lock(&mm->page_table_lock); 2615a5516438SAndi Kleen size = i_size_read(mapping->host) >> huge_page_shift(h); 26164c887265SAdam Litke if (idx >= size) 26174c887265SAdam Litke goto backout; 26184c887265SAdam Litke 261983c54070SNick Piggin ret = 0; 26207f2e9525SGerald Schaefer if (!huge_pte_none(huge_ptep_get(ptep))) 26214c887265SAdam Litke goto backout; 26224c887265SAdam Litke 26231e8f889bSDavid Gibson new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 26241e8f889bSDavid Gibson && (vma->vm_flags & VM_SHARED))); 26251e8f889bSDavid Gibson set_huge_pte_at(mm, address, ptep, new_pte); 26261e8f889bSDavid Gibson 2627788c7df4SHugh Dickins if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 26281e8f889bSDavid Gibson /* Optimization, do the COW without a second fault */ 262904f2cbe3SMel Gorman ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); 26301e8f889bSDavid Gibson } 26311e8f889bSDavid Gibson 2632ac9b9c66SHugh Dickins spin_unlock(&mm->page_table_lock); 26334c887265SAdam Litke unlock_page(page); 26344c887265SAdam Litke out: 2635ac9b9c66SHugh Dickins return ret; 26364c887265SAdam Litke 26374c887265SAdam Litke backout: 26384c887265SAdam Litke spin_unlock(&mm->page_table_lock); 26392b26736cSAndy Whitcroft backout_unlocked: 26404c887265SAdam Litke unlock_page(page); 26414c887265SAdam Litke put_page(page); 26424c887265SAdam Litke goto out; 2643ac9b9c66SHugh Dickins } 2644ac9b9c66SHugh Dickins 264586e5216fSAdam Litke int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2646788c7df4SHugh Dickins unsigned long address, unsigned int flags) 264786e5216fSAdam Litke { 264886e5216fSAdam Litke pte_t *ptep; 264986e5216fSAdam Litke pte_t entry; 26501e8f889bSDavid Gibson int ret; 26510fe6e20bSNaoya Horiguchi struct page *page = NULL; 265257303d80SAndy Whitcroft struct page *pagecache_page = NULL; 26533935baa9SDavid Gibson static DEFINE_MUTEX(hugetlb_instantiation_mutex); 2654a5516438SAndi Kleen struct hstate *h = hstate_vma(vma); 265586e5216fSAdam Litke 2656fd6a03edSNaoya Horiguchi ptep = huge_pte_offset(mm, address); 2657fd6a03edSNaoya Horiguchi if (ptep) { 2658fd6a03edSNaoya Horiguchi entry = huge_ptep_get(ptep); 2659290408d4SNaoya Horiguchi if (unlikely(is_hugetlb_entry_migration(entry))) { 2660290408d4SNaoya Horiguchi migration_entry_wait(mm, (pmd_t *)ptep, address); 2661290408d4SNaoya Horiguchi return 0; 2662290408d4SNaoya Horiguchi } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2663aa50d3a7SAndi Kleen return VM_FAULT_HWPOISON_LARGE | 2664aa50d3a7SAndi Kleen VM_FAULT_SET_HINDEX(h - hstates); 2665fd6a03edSNaoya Horiguchi } 2666fd6a03edSNaoya Horiguchi 2667a5516438SAndi Kleen ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 266886e5216fSAdam Litke if (!ptep) 266986e5216fSAdam Litke return VM_FAULT_OOM; 267086e5216fSAdam Litke 26713935baa9SDavid Gibson /* 26723935baa9SDavid Gibson * Serialize hugepage allocation and instantiation, so that we don't 26733935baa9SDavid Gibson * get spurious allocation failures if two CPUs race to instantiate 26743935baa9SDavid Gibson * the same page in the page cache. 
26753935baa9SDavid Gibson */ 26763935baa9SDavid Gibson mutex_lock(&hugetlb_instantiation_mutex); 26777f2e9525SGerald Schaefer entry = huge_ptep_get(ptep); 26787f2e9525SGerald Schaefer if (huge_pte_none(entry)) { 2679788c7df4SHugh Dickins ret = hugetlb_no_page(mm, vma, address, ptep, flags); 2680b4d1d99fSDavid Gibson goto out_mutex; 26813935baa9SDavid Gibson } 268286e5216fSAdam Litke 268383c54070SNick Piggin ret = 0; 26841e8f889bSDavid Gibson 268557303d80SAndy Whitcroft /* 268657303d80SAndy Whitcroft * If we are going to COW the mapping later, we examine the pending 268757303d80SAndy Whitcroft * reservations for this page now. This will ensure that any 268857303d80SAndy Whitcroft * allocations necessary to record that reservation occur outside the 268957303d80SAndy Whitcroft * spinlock. For private mappings, we also lookup the pagecache 269057303d80SAndy Whitcroft * page now as it is used to determine if a reservation has been 269157303d80SAndy Whitcroft * consumed. 269257303d80SAndy Whitcroft */ 2693788c7df4SHugh Dickins if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { 26942b26736cSAndy Whitcroft if (vma_needs_reservation(h, vma, address) < 0) { 26952b26736cSAndy Whitcroft ret = VM_FAULT_OOM; 2696b4d1d99fSDavid Gibson goto out_mutex; 26972b26736cSAndy Whitcroft } 269857303d80SAndy Whitcroft 2699f83a275dSMel Gorman if (!(vma->vm_flags & VM_MAYSHARE)) 270057303d80SAndy Whitcroft pagecache_page = hugetlbfs_pagecache_page(h, 270157303d80SAndy Whitcroft vma, address); 270257303d80SAndy Whitcroft } 270357303d80SAndy Whitcroft 270456c9cfb1SNaoya Horiguchi /* 270556c9cfb1SNaoya Horiguchi * hugetlb_cow() requires page locks of pte_page(entry) and 270656c9cfb1SNaoya Horiguchi * pagecache_page, so here we need to take the former one 270756c9cfb1SNaoya Horiguchi * when page != pagecache_page or !pagecache_page. 270856c9cfb1SNaoya Horiguchi * Note that locking order is always pagecache_page -> page, 270956c9cfb1SNaoya Horiguchi * so no worry about deadlock.
271056c9cfb1SNaoya Horiguchi */ 27110fe6e20bSNaoya Horiguchi page = pte_page(entry); 271256c9cfb1SNaoya Horiguchi if (page != pagecache_page) 27130fe6e20bSNaoya Horiguchi lock_page(page); 27140fe6e20bSNaoya Horiguchi 27151e8f889bSDavid Gibson spin_lock(&mm->page_table_lock); 27161e8f889bSDavid Gibson /* Check for a racing update before calling hugetlb_cow */ 2717b4d1d99fSDavid Gibson if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 2718b4d1d99fSDavid Gibson goto out_page_table_lock; 2719b4d1d99fSDavid Gibson 2720b4d1d99fSDavid Gibson 2721788c7df4SHugh Dickins if (flags & FAULT_FLAG_WRITE) { 2722b4d1d99fSDavid Gibson if (!pte_write(entry)) { 272357303d80SAndy Whitcroft ret = hugetlb_cow(mm, vma, address, ptep, entry, 272457303d80SAndy Whitcroft pagecache_page); 2725b4d1d99fSDavid Gibson goto out_page_table_lock; 2726b4d1d99fSDavid Gibson } 2727b4d1d99fSDavid Gibson entry = pte_mkdirty(entry); 2728b4d1d99fSDavid Gibson } 2729b4d1d99fSDavid Gibson entry = pte_mkyoung(entry); 2730788c7df4SHugh Dickins if (huge_ptep_set_access_flags(vma, address, ptep, entry, 2731788c7df4SHugh Dickins flags & FAULT_FLAG_WRITE)) 27324b3073e1SRussell King update_mmu_cache(vma, address, ptep); 2733b4d1d99fSDavid Gibson 2734b4d1d99fSDavid Gibson out_page_table_lock: 27351e8f889bSDavid Gibson spin_unlock(&mm->page_table_lock); 273657303d80SAndy Whitcroft 273757303d80SAndy Whitcroft if (pagecache_page) { 273857303d80SAndy Whitcroft unlock_page(pagecache_page); 273957303d80SAndy Whitcroft put_page(pagecache_page); 274057303d80SAndy Whitcroft } 274156c9cfb1SNaoya Horiguchi unlock_page(page); 274257303d80SAndy Whitcroft 2743b4d1d99fSDavid Gibson out_mutex: 27443935baa9SDavid Gibson mutex_unlock(&hugetlb_instantiation_mutex); 27451e8f889bSDavid Gibson 27461e8f889bSDavid Gibson return ret; 274786e5216fSAdam Litke } 274886e5216fSAdam Litke 2749ceb86879SAndi Kleen /* Can be overridden by architectures */ 2750ceb86879SAndi Kleen __attribute__((weak)) struct page * 2751ceb86879SAndi Kleen follow_huge_pud(struct mm_struct *mm, unsigned long address, 2752ceb86879SAndi Kleen pud_t *pud, int write) 2753ceb86879SAndi Kleen { 2754ceb86879SAndi Kleen BUG(); 2755ceb86879SAndi Kleen return NULL; 2756ceb86879SAndi Kleen } 2757ceb86879SAndi Kleen 275863551ae0SDavid Gibson int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 275963551ae0SDavid Gibson struct page **pages, struct vm_area_struct **vmas, 27605b23dbe8SAdam Litke unsigned long *position, int *length, int i, 27612a15efc9SHugh Dickins unsigned int flags) 276263551ae0SDavid Gibson { 2763d5d4b0aaSChen, Kenneth W unsigned long pfn_offset; 2764d5d4b0aaSChen, Kenneth W unsigned long vaddr = *position; 276563551ae0SDavid Gibson int remainder = *length; 2766a5516438SAndi Kleen struct hstate *h = hstate_vma(vma); 276763551ae0SDavid Gibson 27681c59827dSHugh Dickins spin_lock(&mm->page_table_lock); 276963551ae0SDavid Gibson while (vaddr < vma->vm_end && remainder) { 277063551ae0SDavid Gibson pte_t *pte; 27712a15efc9SHugh Dickins int absent; 277263551ae0SDavid Gibson struct page *page; 277363551ae0SDavid Gibson 27744c887265SAdam Litke /* 27754c887265SAdam Litke * Some archs (sparc64, sh*) have multiple pte_ts to 27762a15efc9SHugh Dickins * each hugepage. We have to make sure we get the 27774c887265SAdam Litke * first, for the page indexing below to work.
27784c887265SAdam Litke */ 2779a5516438SAndi Kleen pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); 27802a15efc9SHugh Dickins absent = !pte || huge_pte_none(huge_ptep_get(pte)); 278163551ae0SDavid Gibson 27822a15efc9SHugh Dickins /* 27832a15efc9SHugh Dickins * When coredumping, it suits get_dump_page if we just return 27843ae77f43SHugh Dickins * an error where there's an empty slot with no huge pagecache 27853ae77f43SHugh Dickins * to back it. This way, we avoid allocating a hugepage, and 27863ae77f43SHugh Dickins * the sparse dumpfile avoids allocating disk blocks, but its 27873ae77f43SHugh Dickins * huge holes still show up with zeroes where they need to be. 27882a15efc9SHugh Dickins */ 27893ae77f43SHugh Dickins if (absent && (flags & FOLL_DUMP) && 27903ae77f43SHugh Dickins !hugetlbfs_pagecache_present(h, vma, vaddr)) { 27912a15efc9SHugh Dickins remainder = 0; 27922a15efc9SHugh Dickins break; 27932a15efc9SHugh Dickins } 27942a15efc9SHugh Dickins 27952a15efc9SHugh Dickins if (absent || 27962a15efc9SHugh Dickins ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { 27974c887265SAdam Litke int ret; 27984c887265SAdam Litke 27994c887265SAdam Litke spin_unlock(&mm->page_table_lock); 28002a15efc9SHugh Dickins ret = hugetlb_fault(mm, vma, vaddr, 28012a15efc9SHugh Dickins (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); 28024c887265SAdam Litke spin_lock(&mm->page_table_lock); 2803a89182c7SAdam Litke if (!(ret & VM_FAULT_ERROR)) 28044c887265SAdam Litke continue; 28054c887265SAdam Litke 28061c59827dSHugh Dickins remainder = 0; 28071c59827dSHugh Dickins break; 28081c59827dSHugh Dickins } 280963551ae0SDavid Gibson 2810a5516438SAndi Kleen pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; 28117f2e9525SGerald Schaefer page = pte_page(huge_ptep_get(pte)); 2812d5d4b0aaSChen, Kenneth W same_page: 2813d6692183SChen, Kenneth W if (pages) { 281469d177c2SAndy Whitcroft pages[i] = mem_map_offset(page, pfn_offset); 28154b2e38adSKOSAKI Motohiro get_page(pages[i]); 2816d6692183SChen, Kenneth W } 281763551ae0SDavid Gibson 281863551ae0SDavid Gibson if (vmas) 281963551ae0SDavid Gibson vmas[i] = vma; 282063551ae0SDavid Gibson 282163551ae0SDavid Gibson vaddr += PAGE_SIZE; 2822d5d4b0aaSChen, Kenneth W ++pfn_offset; 282363551ae0SDavid Gibson --remainder; 282463551ae0SDavid Gibson ++i; 2825d5d4b0aaSChen, Kenneth W if (vaddr < vma->vm_end && remainder && 2826a5516438SAndi Kleen pfn_offset < pages_per_huge_page(h)) { 2827d5d4b0aaSChen, Kenneth W /* 2828d5d4b0aaSChen, Kenneth W * We use pfn_offset to avoid touching the pageframes 2829d5d4b0aaSChen, Kenneth W * of this compound page. 2830d5d4b0aaSChen, Kenneth W */ 2831d5d4b0aaSChen, Kenneth W goto same_page; 2832d5d4b0aaSChen, Kenneth W } 283363551ae0SDavid Gibson } 28341c59827dSHugh Dickins spin_unlock(&mm->page_table_lock); 283563551ae0SDavid Gibson *length = remainder; 283663551ae0SDavid Gibson *position = vaddr; 283763551ae0SDavid Gibson 28382a15efc9SHugh Dickins return i ? 
i : -EFAULT; 283963551ae0SDavid Gibson } 28408f860591SZhang, Yanmin 28418f860591SZhang, Yanmin void hugetlb_change_protection(struct vm_area_struct *vma, 28428f860591SZhang, Yanmin unsigned long address, unsigned long end, pgprot_t newprot) 28438f860591SZhang, Yanmin { 28448f860591SZhang, Yanmin struct mm_struct *mm = vma->vm_mm; 28458f860591SZhang, Yanmin unsigned long start = address; 28468f860591SZhang, Yanmin pte_t *ptep; 28478f860591SZhang, Yanmin pte_t pte; 2848a5516438SAndi Kleen struct hstate *h = hstate_vma(vma); 28498f860591SZhang, Yanmin 28508f860591SZhang, Yanmin BUG_ON(address >= end); 28518f860591SZhang, Yanmin flush_cache_range(vma, address, end); 28528f860591SZhang, Yanmin 285339dde65cSChen, Kenneth W spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 28548f860591SZhang, Yanmin spin_lock(&mm->page_table_lock); 2855a5516438SAndi Kleen for (; address < end; address += huge_page_size(h)) { 28568f860591SZhang, Yanmin ptep = huge_pte_offset(mm, address); 28578f860591SZhang, Yanmin if (!ptep) 28588f860591SZhang, Yanmin continue; 285939dde65cSChen, Kenneth W if (huge_pmd_unshare(mm, &address, ptep)) 286039dde65cSChen, Kenneth W continue; 28617f2e9525SGerald Schaefer if (!huge_pte_none(huge_ptep_get(ptep))) { 28628f860591SZhang, Yanmin pte = huge_ptep_get_and_clear(mm, address, ptep); 28638f860591SZhang, Yanmin pte = pte_mkhuge(pte_modify(pte, newprot)); 28648f860591SZhang, Yanmin set_huge_pte_at(mm, address, ptep, pte); 28658f860591SZhang, Yanmin } 28668f860591SZhang, Yanmin } 28678f860591SZhang, Yanmin spin_unlock(&mm->page_table_lock); 286839dde65cSChen, Kenneth W spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 28698f860591SZhang, Yanmin 28708f860591SZhang, Yanmin flush_tlb_range(vma, start, end); 28718f860591SZhang, Yanmin } 28728f860591SZhang, Yanmin 2873a1e78772SMel Gorman int hugetlb_reserve_pages(struct inode *inode, 2874a1e78772SMel Gorman long from, long to, 28755a6fe125SMel Gorman struct vm_area_struct *vma, 28765a6fe125SMel Gorman int acctflag) 2877e4e574b7SAdam Litke { 287817c9d12eSMel Gorman long ret, chg; 2879a5516438SAndi Kleen struct hstate *h = hstate_inode(inode); 2880e4e574b7SAdam Litke 2881a1e78772SMel Gorman /* 288217c9d12eSMel Gorman * Only apply hugepage reservation if asked. At fault time, an 288317c9d12eSMel Gorman * attempt will be made for VM_NORESERVE to allocate a page 288417c9d12eSMel Gorman * and filesystem quota without using reserves 288517c9d12eSMel Gorman */ 288617c9d12eSMel Gorman if (acctflag & VM_NORESERVE) 288717c9d12eSMel Gorman return 0; 288817c9d12eSMel Gorman 288917c9d12eSMel Gorman /* 2890a1e78772SMel Gorman * Shared mappings base their reservation on the number of pages that 2891a1e78772SMel Gorman * are already allocated on behalf of the file. Private mappings need 2892a1e78772SMel Gorman * to reserve the full area even if read-only as mprotect() may be 2893a1e78772SMel Gorman * called to make the mapping read-write. 
Assume !vma is a shm mapping 2894a1e78772SMel Gorman */ 2895f83a275dSMel Gorman if (!vma || vma->vm_flags & VM_MAYSHARE) 2896e4e574b7SAdam Litke chg = region_chg(&inode->i_mapping->private_list, from, to); 28975a6fe125SMel Gorman else { 28985a6fe125SMel Gorman struct resv_map *resv_map = resv_map_alloc(); 28995a6fe125SMel Gorman if (!resv_map) 29005a6fe125SMel Gorman return -ENOMEM; 29015a6fe125SMel Gorman 290217c9d12eSMel Gorman chg = to - from; 290317c9d12eSMel Gorman 29045a6fe125SMel Gorman set_vma_resv_map(vma, resv_map); 29055a6fe125SMel Gorman set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 29065a6fe125SMel Gorman } 29075a6fe125SMel Gorman 290817c9d12eSMel Gorman if (chg < 0) 290917c9d12eSMel Gorman return chg; 291017c9d12eSMel Gorman 291117c9d12eSMel Gorman /* There must be enough filesystem quota for the mapping */ 291217c9d12eSMel Gorman if (hugetlb_get_quota(inode->i_mapping, chg)) 291317c9d12eSMel Gorman return -ENOSPC; 291417c9d12eSMel Gorman 291517c9d12eSMel Gorman /* 291617c9d12eSMel Gorman * Check enough hugepages are available for the reservation. 291717c9d12eSMel Gorman * Hand back the quota if there are not 291817c9d12eSMel Gorman */ 291917c9d12eSMel Gorman ret = hugetlb_acct_memory(h, chg); 292017c9d12eSMel Gorman if (ret < 0) { 292117c9d12eSMel Gorman hugetlb_put_quota(inode->i_mapping, chg); 292217c9d12eSMel Gorman return ret; 292317c9d12eSMel Gorman } 292417c9d12eSMel Gorman 292517c9d12eSMel Gorman /* 292617c9d12eSMel Gorman * Account for the reservations made. Shared mappings record regions 292717c9d12eSMel Gorman * that have reservations as they are shared by multiple VMAs. 292817c9d12eSMel Gorman * When the last VMA disappears, the region map says how much 292917c9d12eSMel Gorman * the reservation was and the page cache tells how much of 293017c9d12eSMel Gorman * the reservation was consumed. Private mappings are per-VMA and 293117c9d12eSMel Gorman * only the consumed reservations are tracked. When the VMA 293217c9d12eSMel Gorman * disappears, the original reservation is the VMA size and the 293317c9d12eSMel Gorman * consumed reservations are stored in the map. 
Hence, nothing 293417c9d12eSMel Gorman * else has to be done for private mappings here 293517c9d12eSMel Gorman */ 2936f83a275dSMel Gorman if (!vma || vma->vm_flags & VM_MAYSHARE) 293717c9d12eSMel Gorman region_add(&inode->i_mapping->private_list, from, to); 2938a43a8c39SChen, Kenneth W return 0; 2939a43a8c39SChen, Kenneth W } 2940a43a8c39SChen, Kenneth W 2941a43a8c39SChen, Kenneth W void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 2942a43a8c39SChen, Kenneth W { 2943a5516438SAndi Kleen struct hstate *h = hstate_inode(inode); 2944a43a8c39SChen, Kenneth W long chg = region_truncate(&inode->i_mapping->private_list, offset); 294545c682a6SKen Chen 294645c682a6SKen Chen spin_lock(&inode->i_lock); 2947e4c6f8beSEric Sandeen inode->i_blocks -= (blocks_per_huge_page(h) * freed); 294845c682a6SKen Chen spin_unlock(&inode->i_lock); 294945c682a6SKen Chen 295090d8b7e6SAdam Litke hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2951a5516438SAndi Kleen hugetlb_acct_memory(h, -(chg - freed)); 2952a43a8c39SChen, Kenneth W } 295393f70f90SNaoya Horiguchi 2954d5bd9106SAndi Kleen #ifdef CONFIG_MEMORY_FAILURE 2955d5bd9106SAndi Kleen 29566de2b1aaSNaoya Horiguchi /* Should be called in hugetlb_lock */ 29576de2b1aaSNaoya Horiguchi static int is_hugepage_on_freelist(struct page *hpage) 29586de2b1aaSNaoya Horiguchi { 29596de2b1aaSNaoya Horiguchi struct page *page; 29606de2b1aaSNaoya Horiguchi struct page *tmp; 29616de2b1aaSNaoya Horiguchi struct hstate *h = page_hstate(hpage); 29626de2b1aaSNaoya Horiguchi int nid = page_to_nid(hpage); 29636de2b1aaSNaoya Horiguchi 29646de2b1aaSNaoya Horiguchi list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) 29656de2b1aaSNaoya Horiguchi if (page == hpage) 29666de2b1aaSNaoya Horiguchi return 1; 29676de2b1aaSNaoya Horiguchi return 0; 29686de2b1aaSNaoya Horiguchi } 29696de2b1aaSNaoya Horiguchi 297093f70f90SNaoya Horiguchi /* 297193f70f90SNaoya Horiguchi * This function is called from memory failure code. 297293f70f90SNaoya Horiguchi * Assume the caller holds page lock of the head page. 297393f70f90SNaoya Horiguchi */ 29746de2b1aaSNaoya Horiguchi int dequeue_hwpoisoned_huge_page(struct page *hpage) 297593f70f90SNaoya Horiguchi { 297693f70f90SNaoya Horiguchi struct hstate *h = page_hstate(hpage); 297793f70f90SNaoya Horiguchi int nid = page_to_nid(hpage); 29786de2b1aaSNaoya Horiguchi int ret = -EBUSY; 297993f70f90SNaoya Horiguchi 298093f70f90SNaoya Horiguchi spin_lock(&hugetlb_lock); 29816de2b1aaSNaoya Horiguchi if (is_hugepage_on_freelist(hpage)) { 298293f70f90SNaoya Horiguchi list_del(&hpage->lru); 29838c6c2ecbSNaoya Horiguchi set_page_refcounted(hpage); 298493f70f90SNaoya Horiguchi h->free_huge_pages--; 298593f70f90SNaoya Horiguchi h->free_huge_pages_node[nid]--; 29866de2b1aaSNaoya Horiguchi ret = 0; 298793f70f90SNaoya Horiguchi } 29886de2b1aaSNaoya Horiguchi spin_unlock(&hugetlb_lock); 29896de2b1aaSNaoya Horiguchi return ret; 29906de2b1aaSNaoya Horiguchi } 29916de2b1aaSNaoya Horiguchi #endif 2992
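/*
 * Editor's sketch, not part of the original file: hugetlb faults are never
 * delivered through hugetlb_vm_op_fault() above (it simply BUGs); the generic
 * fault path is expected to dispatch huge-page VMAs straight to
 * hugetlb_fault().  The function below is a hypothetical stand-in for that
 * call site, shown only to make the entry point visible, and is guarded by
 * #if 0 so it is never compiled.
 */
#if 0
static int example_fault_dispatch(struct mm_struct *mm,
				  struct vm_area_struct *vma,
				  unsigned long address, unsigned int flags)
{
	/* Huge-page VMAs bypass the normal pte fault path entirely. */
	if (unlikely(is_vm_hugetlb_page(vma)))
		return hugetlb_fault(mm, vma, address, flags);

	/* ... normal pud/pmd/pte fault handling for small pages ... */
	return 0;
}
#endif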
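/*
 * Editor's sketch, not part of the original file: how a filesystem such as
 * hugetlbfs is expected to drive the reservation interface above -- reserve
 * the whole range when a mapping is set up and hand reservations back when
 * the file is truncated.  The example_* helpers, the offset arithmetic and
 * the acctflag value are assumptions for illustration; the real callers live
 * in fs/hugetlbfs and are not shown in this listing.  Guarded by #if 0.
 */
#if 0
static int example_hugetlb_mmap_setup(struct file *file,
				      struct vm_area_struct *vma)
{
	struct inode *inode = file->f_mapping->host;
	struct hstate *h = hstate_inode(inode);
	/* The region list works in huge-page units, so convert from pages. */
	long from = vma->vm_pgoff >> (huge_page_shift(h) - PAGE_SHIFT);
	long to = from + ((vma->vm_end - vma->vm_start) >> huge_page_shift(h));

	/*
	 * Charge the quota and huge page pool up front; only the
	 * VM_NORESERVE bit of acctflag is examined by the callee.
	 */
	return hugetlb_reserve_pages(inode, from, to, vma,
				     vma->vm_flags & VM_NORESERVE);
}

static void example_hugetlb_truncate(struct inode *inode, loff_t newsize,
				     long freed)
{
	struct hstate *h = hstate_inode(inode);

	/* Give back reservations (and quota) beyond the new end of file. */
	hugetlb_unreserve_pages(inode, newsize >> huge_page_shift(h), freed);
}
#endif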
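/*
 * Editor's sketch, not part of the original file: dequeue_hwpoisoned_huge_page()
 * above expects the head page lock to be held by its caller in the memory
 * failure handling code.  The wrapper below is a hypothetical illustration of
 * that calling convention, not a copy of mm/memory-failure.c.  Guarded by
 * #if 0.
 */
#if 0
static int example_isolate_poisoned_huge_page(struct page *hpage)
{
	int ret;

	lock_page(hpage);	/* the callee assumes the head page is locked */
	ret = dequeue_hwpoisoned_huge_page(hpage);
	unlock_page(hpage);

	/* 0: page was free and is now isolated; -EBUSY: page was in use. */
	return ret;
}
#endif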