/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static DEFINE_SPINLOCK(hugetlb_lock);

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

/*
 * Take a free huge page off the local node's freelist, falling back to
 * the first node that has one.  Caller must hold hugetlb_lock.
 */
static struct page *dequeue_huge_page(void)
{
	int nid = numa_node_id();
	struct page *page = NULL;

	if (list_empty(&hugepage_freelists[nid])) {
		for (nid = 0; nid < MAX_NUMNODES; ++nid)
			if (!list_empty(&hugepage_freelists[nid]))
				break;
	}
	if (nid >= 0 && nid < MAX_NUMNODES &&
	    !list_empty(&hugepage_freelists[nid])) {
		page = list_entry(hugepage_freelists[nid].next,
				  struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}

/*
 * Allocate a fresh huge page from the buddy allocator, round-robining
 * across the online nodes.
 */
static struct page *alloc_fresh_huge_page(void)
{
	static int nid = 0;
	struct page *page;
	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	nid = (nid + 1) % num_online_nodes();
	if (page) {
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
	}
	return page;
}

void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);
	page[1].mapping = NULL;

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

struct page *alloc_huge_page(void)
{
	struct page *page;
	int i;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page();
	if (!page) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	}
	spin_unlock(&hugetlb_lock);
	set_page_count(page, 1);
	page[1].mapping = (void *)free_huge_page;
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
		clear_highpage(&page[i]);
	return page;
}

static int __init hugetlb_init(void)
{
	unsigned long i;
	struct page *page;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		page = alloc_fresh_huge_page();
		if (!page)
			break;
		spin_lock(&hugetlb_lock);
		enqueue_huge_page(page);
		spin_unlock(&hugetlb_lock);
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
		set_page_count(&page[i], 0);
	}
	set_page_count(page, 1);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i, nid;
	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			nid = page_zone(page)->zone_pgdat->node_id;
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

/*
 * Grow or shrink the pool of huge pages to @count pages and return the
 * resulting pool size.
 */
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		struct page *page = alloc_fresh_huge_page();
		if (!page)
			return nr_huge_pages;
		spin_lock(&hugetlb_lock);
		enqueue_huge_page(page);
		spin_unlock(&hugetlb_lock);
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page();
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}
#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

int is_hugepage_mem_enough(size_t size)
{
	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}
EXPORT_SYMBOL(hugetlb_total_pages);

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
 * get this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
{
	pte_t entry;

	if (vma->vm_flags & VM_WRITE) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

/*
 * Called at fork time to share a hugetlb mapping: copy the huge ptes of
 * @vma from @src into @dst, taking a reference on each mapped huge page.
 */
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		put_page(page);
		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
	}

	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
}

/*
 * Find and lock the huge page at @idx in the mapping's page cache,
 * allocating a fresh one and adding it to the cache if none is present.
 */
static struct page *find_lock_huge_page(struct address_space *mapping,
					unsigned long idx)
{
	struct page *page;
	int err;
	struct inode *inode = mapping->host;
	unsigned long size;

retry:
	page = find_lock_page(mapping, idx);
	if (page)
		goto out;

	/* Check to make sure the mapping hasn't been truncated */
	size = i_size_read(inode) >> HPAGE_SHIFT;
	if (idx >= size)
		goto out;

	if (hugetlb_get_quota(mapping))
		goto out;
	page = alloc_huge_page();
	if (!page) {
		hugetlb_put_quota(mapping);
		goto out;
	}

	err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
	if (err) {
		put_page(page);
		hugetlb_put_quota(mapping);
		if (err == -EEXIST)
			goto retry;
		page = NULL;
	}
out:
	return page;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	pte_t *pte;
	struct page *page;
	struct address_space *mapping;

	pte = huge_pte_alloc(mm, address);
	if (!pte)
		goto out;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
	page = find_lock_huge_page(mapping, idx);
	if (!page)
		goto out;

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*pte))
		goto backout;

	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
	set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

/*
 * Back get_user_pages() for hugetlb VMAs: fault in any huge pages that
 * are not yet present and hand back the constituent small pages.
 */
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long vpfn, vaddr = *position;
	int remainder = *length;

	vpfn = vaddr/PAGE_SIZE;
	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (ret == VM_FAULT_MINOR)
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		if (pages) {
			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
			get_page(page);
			pages[i] = page;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++vpfn;
		--remainder;
		++i;
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}
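
/*
 * Userspace usage sketch (illustrative only, not part of this file): the
 * pool managed above is normally consumed by mmap()ing a file on a mounted
 * hugetlbfs instance; the first touch of each huge page (the memset below)
 * is what drives hugetlb_fault() above.  The mount point and the 2MB huge
 * page size are assumptions, e.g. after
 * "mount -t hugetlbfs none /mnt/huge" on an x86 box with 2MB huge pages
 * and a non-zero /proc/sys/vm/nr_hugepages.
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	#define LENGTH	(2UL * 1024 * 1024)
 *
 *	int main(void)
 *	{
 *		void *addr;
 *		int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
 *
 *		if (fd < 0)
 *			return 1;
 *		addr = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
 *			    MAP_SHARED, fd, 0);
 *		if (addr == MAP_FAILED)
 *			return 1;
 *		memset(addr, 0, LENGTH);
 *		munmap(addr, LENGTH);
 *		close(fd);
 *		return 0;
 *	}
 */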