/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid = numa_node_id();
	struct page *page = NULL;
	struct zonelist *zonelist = huge_zonelist(vma, address);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = (*z)->zone_pgdat->node_id;
		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
		    !list_empty(&hugepage_freelists[nid]))
			break;
	}

	if (*z) {
		page = list_entry(hugepage_freelists[nid].next,
				  struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}

/* Allocate a fresh huge page from the buddy allocator, round-robin across
 * the online nodes, and release it into the hugepage free lists. */
static int alloc_fresh_huge_page(void)
{
	static int nid = 0;
	struct page *page;
	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
				HUGETLB_PAGE_ORDER);
	nid = (nid + 1) % num_online_nodes();
	if (page) {
		page[1].lru.next = (void *)free_huge_page;	/* dtor */
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}

void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

/* Take a huge page off the free lists and zero it for the faulting process. */
struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int i;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page(vma, addr);
	if (!page) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	}
	spin_unlock(&hugetlb_lock);
	set_page_count(page, 1);
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
		clear_user_highpage(&page[i], addr);
	return page;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
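	/* Non-zero return: the "hugepages=" option has been handled here. */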
	return 1;
}
__setup("hugepages=", hugetlb_setup);

#ifdef CONFIG_SYSCTL
/* Give a huge page back to the buddy allocator, resetting its page flags
 * and compound destructor first. */
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
				1 << PG_referenced | 1 << PG_dirty |
				1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	page[1].lru.next = NULL;
	set_page_count(page, 1);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i, nid;
	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			nid = page_zone(page)->zone_pgdat->node_id;
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

/* Grow or shrink the hugepage pool to 'count' pages, releasing lowmem
 * pages first when shrinking (see try_to_free_low()). */
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		if (!alloc_fresh_huge_page())
			return nr_huge_pages;
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}
#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

int is_hugepage_mem_enough(size_t size)
{
	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	ptep_set_access_flags(vma, address, ptep, entry, 1);
	update_mmu_cache(vma, address, entry);
	lazy_mmu_prot_update(entry);
}


int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		put_page(page);
		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
	}

	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
}

static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int i, avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return VM_FAULT_MINOR;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

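	/*
	 * Drop page_table_lock while copying the huge page; the pte is
	 * re-checked below before the new page is mapped, in case of a
	 * racing update.
	 */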
	spin_unlock(&mm->page_table_lock);
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
		copy_user_highpage(new_page + i, old_page + i,
				   address + i*PAGE_SIZE);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return VM_FAULT_MINOR;
}

int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*ptep))
		goto backout;

	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	entry = *ptep;
	if (pte_none(entry))
		return hugetlb_no_page(mm, vma, address, ptep, write_access);

	ret = VM_FAULT_MINOR;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);

	return ret;
}

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long vpfn, vaddr = *position;
	int remainder = *length;

	vpfn = vaddr/PAGE_SIZE;
	spin_lock(&mm->page_table_lock);
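	/*
	 * Walk the requested range one base page at a time, faulting in
	 * any huge page that is not yet mapped and filling pages[] and
	 * vmas[] for the caller.
	 */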
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (ret == VM_FAULT_MINOR)
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		if (pages) {
			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
			get_page(page);
			pages[i] = page;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++vpfn;
		--remainder;
		++i;
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}