1 /* 2 * hugetlbpage-backed filesystem. Based on ramfs. 3 * 4 * William Irwin, 2002 5 * 6 * Copyright (C) 2002 Linus Torvalds. 7 */ 8 9 #include <linux/module.h> 10 #include <linux/thread_info.h> 11 #include <asm/current.h> 12 #include <linux/sched.h> /* remove ASAP */ 13 #include <linux/fs.h> 14 #include <linux/mount.h> 15 #include <linux/file.h> 16 #include <linux/kernel.h> 17 #include <linux/writeback.h> 18 #include <linux/pagemap.h> 19 #include <linux/highmem.h> 20 #include <linux/init.h> 21 #include <linux/string.h> 22 #include <linux/capability.h> 23 #include <linux/ctype.h> 24 #include <linux/backing-dev.h> 25 #include <linux/hugetlb.h> 26 #include <linux/pagevec.h> 27 #include <linux/parser.h> 28 #include <linux/mman.h> 29 #include <linux/quotaops.h> 30 #include <linux/slab.h> 31 #include <linux/dnotify.h> 32 #include <linux/statfs.h> 33 #include <linux/security.h> 34 35 #include <asm/uaccess.h> 36 37 /* some random number */ 38 #define HUGETLBFS_MAGIC 0x958458f6 39 40 static const struct super_operations hugetlbfs_ops; 41 static const struct address_space_operations hugetlbfs_aops; 42 const struct file_operations hugetlbfs_file_operations; 43 static const struct inode_operations hugetlbfs_dir_inode_operations; 44 static const struct inode_operations hugetlbfs_inode_operations; 45 46 static struct backing_dev_info hugetlbfs_backing_dev_info = { 47 .ra_pages = 0, /* No readahead */ 48 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 49 }; 50 51 int sysctl_hugetlb_shm_group; 52 53 enum { 54 Opt_size, Opt_nr_inodes, 55 Opt_mode, Opt_uid, Opt_gid, 56 Opt_err, 57 }; 58 59 static match_table_t tokens = { 60 {Opt_size, "size=%s"}, 61 {Opt_nr_inodes, "nr_inodes=%s"}, 62 {Opt_mode, "mode=%o"}, 63 {Opt_uid, "uid=%u"}, 64 {Opt_gid, "gid=%u"}, 65 {Opt_err, NULL}, 66 }; 67 68 static void huge_pagevec_release(struct pagevec *pvec) 69 { 70 int i; 71 72 for (i = 0; i < pagevec_count(pvec); ++i) 73 put_page(pvec->pages[i]); 74 75 pagevec_reinit(pvec); 76 } 77 78 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) 79 { 80 struct inode *inode = file->f_path.dentry->d_inode; 81 loff_t len, vma_len; 82 int ret; 83 84 /* 85 * vma address alignment (but not the pgoff alignment) has 86 * already been checked by prepare_hugepage_range. If you add 87 * any error returns here, do so after setting VM_HUGETLB, so 88 * is_vm_hugetlb_page tests below unmap_region go the right 89 * way when do_mmap_pgoff unwinds (may be important on powerpc 90 * and ia64). 91 */ 92 vma->vm_flags |= VM_HUGETLB | VM_RESERVED; 93 vma->vm_ops = &hugetlb_vm_ops; 94 95 if (vma->vm_pgoff & ~(HPAGE_MASK >> PAGE_SHIFT)) 96 return -EINVAL; 97 98 vma_len = (loff_t)(vma->vm_end - vma->vm_start); 99 100 mutex_lock(&inode->i_mutex); 101 file_accessed(file); 102 103 ret = -ENOMEM; 104 len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 105 106 if (vma->vm_flags & VM_MAYSHARE && 107 hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT), 108 len >> HPAGE_SHIFT)) 109 goto out; 110 111 ret = 0; 112 hugetlb_prefault_arch_hook(vma->vm_mm); 113 if (vma->vm_flags & VM_WRITE && inode->i_size < len) 114 inode->i_size = len; 115 out: 116 mutex_unlock(&inode->i_mutex); 117 118 return ret; 119 } 120 121 /* 122 * Called under down_write(mmap_sem). 123 */ 124 125 #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA 126 static unsigned long 127 hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 128 unsigned long len, unsigned long pgoff, unsigned long flags) 129 { 130 struct mm_struct *mm = current->mm; 131 struct vm_area_struct *vma; 132 unsigned long start_addr; 133 134 if (len & ~HPAGE_MASK) 135 return -EINVAL; 136 if (len > TASK_SIZE) 137 return -ENOMEM; 138 139 if (flags & MAP_FIXED) { 140 if (prepare_hugepage_range(addr, len)) 141 return -EINVAL; 142 return addr; 143 } 144 145 if (addr) { 146 addr = ALIGN(addr, HPAGE_SIZE); 147 vma = find_vma(mm, addr); 148 if (TASK_SIZE - len >= addr && 149 (!vma || addr + len <= vma->vm_start)) 150 return addr; 151 } 152 153 start_addr = mm->free_area_cache; 154 155 if (len <= mm->cached_hole_size) 156 start_addr = TASK_UNMAPPED_BASE; 157 158 full_search: 159 addr = ALIGN(start_addr, HPAGE_SIZE); 160 161 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 162 /* At this point: (!vma || addr < vma->vm_end). */ 163 if (TASK_SIZE - len < addr) { 164 /* 165 * Start a new search - just in case we missed 166 * some holes. 167 */ 168 if (start_addr != TASK_UNMAPPED_BASE) { 169 start_addr = TASK_UNMAPPED_BASE; 170 goto full_search; 171 } 172 return -ENOMEM; 173 } 174 175 if (!vma || addr + len <= vma->vm_start) 176 return addr; 177 addr = ALIGN(vma->vm_end, HPAGE_SIZE); 178 } 179 } 180 #endif 181 182 static int 183 hugetlbfs_read_actor(struct page *page, unsigned long offset, 184 char __user *buf, unsigned long count, 185 unsigned long size) 186 { 187 char *kaddr; 188 unsigned long left, copied = 0; 189 int i, chunksize; 190 191 if (size > count) 192 size = count; 193 194 /* Find which 4k chunk and offset with in that chunk */ 195 i = offset >> PAGE_CACHE_SHIFT; 196 offset = offset & ~PAGE_CACHE_MASK; 197 198 while (size) { 199 chunksize = PAGE_CACHE_SIZE; 200 if (offset) 201 chunksize -= offset; 202 if (chunksize > size) 203 chunksize = size; 204 kaddr = kmap(&page[i]); 205 left = __copy_to_user(buf, kaddr + offset, chunksize); 206 kunmap(&page[i]); 207 if (left) { 208 copied += (chunksize - left); 209 break; 210 } 211 offset = 0; 212 size -= chunksize; 213 buf += chunksize; 214 copied += chunksize; 215 i++; 216 } 217 return copied ? copied : -EFAULT; 218 } 219 220 /* 221 * Support for read() - Find the page attached to f_mapping and copy out the 222 * data. Its *very* similar to do_generic_mapping_read(), we can't use that 223 * since it has PAGE_CACHE_SIZE assumptions. 224 */ 225 static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, 226 size_t len, loff_t *ppos) 227 { 228 struct address_space *mapping = filp->f_mapping; 229 struct inode *inode = mapping->host; 230 unsigned long index = *ppos >> HPAGE_SHIFT; 231 unsigned long offset = *ppos & ~HPAGE_MASK; 232 unsigned long end_index; 233 loff_t isize; 234 ssize_t retval = 0; 235 236 mutex_lock(&inode->i_mutex); 237 238 /* validate length */ 239 if (len == 0) 240 goto out; 241 242 isize = i_size_read(inode); 243 if (!isize) 244 goto out; 245 246 end_index = (isize - 1) >> HPAGE_SHIFT; 247 for (;;) { 248 struct page *page; 249 int nr, ret; 250 251 /* nr is the maximum number of bytes to copy from this page */ 252 nr = HPAGE_SIZE; 253 if (index >= end_index) { 254 if (index > end_index) 255 goto out; 256 nr = ((isize - 1) & ~HPAGE_MASK) + 1; 257 if (nr <= offset) { 258 goto out; 259 } 260 } 261 nr = nr - offset; 262 263 /* Find the page */ 264 page = find_get_page(mapping, index); 265 if (unlikely(page == NULL)) { 266 /* 267 * We have a HOLE, zero out the user-buffer for the 268 * length of the hole or request. 269 */ 270 ret = len < nr ? len : nr; 271 if (clear_user(buf, ret)) 272 ret = -EFAULT; 273 } else { 274 /* 275 * We have the page, copy it to user space buffer. 276 */ 277 ret = hugetlbfs_read_actor(page, offset, buf, len, nr); 278 } 279 if (ret < 0) { 280 if (retval == 0) 281 retval = ret; 282 if (page) 283 page_cache_release(page); 284 goto out; 285 } 286 287 offset += ret; 288 retval += ret; 289 len -= ret; 290 index += offset >> HPAGE_SHIFT; 291 offset &= ~HPAGE_MASK; 292 293 if (page) 294 page_cache_release(page); 295 296 /* short read or no more work */ 297 if ((ret != nr) || (len == 0)) 298 break; 299 } 300 out: 301 *ppos = ((loff_t)index << HPAGE_SHIFT) + offset; 302 mutex_unlock(&inode->i_mutex); 303 return retval; 304 } 305 306 /* 307 * Read a page. Again trivial. If it didn't already exist 308 * in the page cache, it is zero-filled. 309 */ 310 static int hugetlbfs_readpage(struct file *file, struct page * page) 311 { 312 unlock_page(page); 313 return -EINVAL; 314 } 315 316 static int hugetlbfs_write_begin(struct file *file, 317 struct address_space *mapping, 318 loff_t pos, unsigned len, unsigned flags, 319 struct page **pagep, void **fsdata) 320 { 321 return -EINVAL; 322 } 323 324 static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, 325 loff_t pos, unsigned len, unsigned copied, 326 struct page *page, void *fsdata) 327 { 328 BUG(); 329 return -EINVAL; 330 } 331 332 static void truncate_huge_page(struct page *page) 333 { 334 cancel_dirty_page(page, /* No IO accounting for huge pages? */0); 335 ClearPageUptodate(page); 336 remove_from_page_cache(page); 337 put_page(page); 338 } 339 340 static void truncate_hugepages(struct inode *inode, loff_t lstart) 341 { 342 struct address_space *mapping = &inode->i_data; 343 const pgoff_t start = lstart >> HPAGE_SHIFT; 344 struct pagevec pvec; 345 pgoff_t next; 346 int i, freed = 0; 347 348 pagevec_init(&pvec, 0); 349 next = start; 350 while (1) { 351 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 352 if (next == start) 353 break; 354 next = start; 355 continue; 356 } 357 358 for (i = 0; i < pagevec_count(&pvec); ++i) { 359 struct page *page = pvec.pages[i]; 360 361 lock_page(page); 362 if (page->index > next) 363 next = page->index; 364 ++next; 365 truncate_huge_page(page); 366 unlock_page(page); 367 freed++; 368 } 369 huge_pagevec_release(&pvec); 370 } 371 BUG_ON(!lstart && mapping->nrpages); 372 hugetlb_unreserve_pages(inode, start, freed); 373 } 374 375 static void hugetlbfs_delete_inode(struct inode *inode) 376 { 377 truncate_hugepages(inode, 0); 378 clear_inode(inode); 379 } 380 381 static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock) 382 { 383 struct super_block *sb = inode->i_sb; 384 385 if (!hlist_unhashed(&inode->i_hash)) { 386 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 387 list_move(&inode->i_list, &inode_unused); 388 inodes_stat.nr_unused++; 389 if (!sb || (sb->s_flags & MS_ACTIVE)) { 390 spin_unlock(&inode_lock); 391 return; 392 } 393 inode->i_state |= I_WILL_FREE; 394 spin_unlock(&inode_lock); 395 /* 396 * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK 397 * in our backing_dev_info. 398 */ 399 write_inode_now(inode, 1); 400 spin_lock(&inode_lock); 401 inode->i_state &= ~I_WILL_FREE; 402 inodes_stat.nr_unused--; 403 hlist_del_init(&inode->i_hash); 404 } 405 list_del_init(&inode->i_list); 406 list_del_init(&inode->i_sb_list); 407 inode->i_state |= I_FREEING; 408 inodes_stat.nr_inodes--; 409 spin_unlock(&inode_lock); 410 truncate_hugepages(inode, 0); 411 clear_inode(inode); 412 destroy_inode(inode); 413 } 414 415 static void hugetlbfs_drop_inode(struct inode *inode) 416 { 417 if (!inode->i_nlink) 418 generic_delete_inode(inode); 419 else 420 hugetlbfs_forget_inode(inode); 421 } 422 423 static inline void 424 hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) 425 { 426 struct vm_area_struct *vma; 427 struct prio_tree_iter iter; 428 429 vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { 430 unsigned long v_offset; 431 432 /* 433 * Can the expression below overflow on 32-bit arches? 434 * No, because the prio_tree returns us only those vmas 435 * which overlap the truncated area starting at pgoff, 436 * and no vma on a 32-bit arch can span beyond the 4GB. 437 */ 438 if (vma->vm_pgoff < pgoff) 439 v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; 440 else 441 v_offset = 0; 442 443 __unmap_hugepage_range(vma, 444 vma->vm_start + v_offset, vma->vm_end); 445 } 446 } 447 448 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) 449 { 450 pgoff_t pgoff; 451 struct address_space *mapping = inode->i_mapping; 452 453 BUG_ON(offset & ~HPAGE_MASK); 454 pgoff = offset >> PAGE_SHIFT; 455 456 i_size_write(inode, offset); 457 spin_lock(&mapping->i_mmap_lock); 458 if (!prio_tree_empty(&mapping->i_mmap)) 459 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); 460 spin_unlock(&mapping->i_mmap_lock); 461 truncate_hugepages(inode, offset); 462 return 0; 463 } 464 465 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) 466 { 467 struct inode *inode = dentry->d_inode; 468 int error; 469 unsigned int ia_valid = attr->ia_valid; 470 471 BUG_ON(!inode); 472 473 error = inode_change_ok(inode, attr); 474 if (error) 475 goto out; 476 477 if (ia_valid & ATTR_SIZE) { 478 error = -EINVAL; 479 if (!(attr->ia_size & ~HPAGE_MASK)) 480 error = hugetlb_vmtruncate(inode, attr->ia_size); 481 if (error) 482 goto out; 483 attr->ia_valid &= ~ATTR_SIZE; 484 } 485 error = inode_setattr(inode, attr); 486 out: 487 return error; 488 } 489 490 static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, 491 gid_t gid, int mode, dev_t dev) 492 { 493 struct inode *inode; 494 495 inode = new_inode(sb); 496 if (inode) { 497 struct hugetlbfs_inode_info *info; 498 inode->i_mode = mode; 499 inode->i_uid = uid; 500 inode->i_gid = gid; 501 inode->i_blocks = 0; 502 inode->i_mapping->a_ops = &hugetlbfs_aops; 503 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 504 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 505 INIT_LIST_HEAD(&inode->i_mapping->private_list); 506 info = HUGETLBFS_I(inode); 507 mpol_shared_policy_init(&info->policy, NULL); 508 switch (mode & S_IFMT) { 509 default: 510 init_special_inode(inode, mode, dev); 511 break; 512 case S_IFREG: 513 inode->i_op = &hugetlbfs_inode_operations; 514 inode->i_fop = &hugetlbfs_file_operations; 515 break; 516 case S_IFDIR: 517 inode->i_op = &hugetlbfs_dir_inode_operations; 518 inode->i_fop = &simple_dir_operations; 519 520 /* directory inodes start off with i_nlink == 2 (for "." entry) */ 521 inc_nlink(inode); 522 break; 523 case S_IFLNK: 524 inode->i_op = &page_symlink_inode_operations; 525 break; 526 } 527 } 528 return inode; 529 } 530 531 /* 532 * File creation. Allocate an inode, and we're done.. 533 */ 534 static int hugetlbfs_mknod(struct inode *dir, 535 struct dentry *dentry, int mode, dev_t dev) 536 { 537 struct inode *inode; 538 int error = -ENOSPC; 539 gid_t gid; 540 541 if (dir->i_mode & S_ISGID) { 542 gid = dir->i_gid; 543 if (S_ISDIR(mode)) 544 mode |= S_ISGID; 545 } else { 546 gid = current->fsgid; 547 } 548 inode = hugetlbfs_get_inode(dir->i_sb, current->fsuid, gid, mode, dev); 549 if (inode) { 550 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 551 d_instantiate(dentry, inode); 552 dget(dentry); /* Extra count - pin the dentry in core */ 553 error = 0; 554 } 555 return error; 556 } 557 558 static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 559 { 560 int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); 561 if (!retval) 562 inc_nlink(dir); 563 return retval; 564 } 565 566 static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) 567 { 568 return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); 569 } 570 571 static int hugetlbfs_symlink(struct inode *dir, 572 struct dentry *dentry, const char *symname) 573 { 574 struct inode *inode; 575 int error = -ENOSPC; 576 gid_t gid; 577 578 if (dir->i_mode & S_ISGID) 579 gid = dir->i_gid; 580 else 581 gid = current->fsgid; 582 583 inode = hugetlbfs_get_inode(dir->i_sb, current->fsuid, 584 gid, S_IFLNK|S_IRWXUGO, 0); 585 if (inode) { 586 int l = strlen(symname)+1; 587 error = page_symlink(inode, symname, l); 588 if (!error) { 589 d_instantiate(dentry, inode); 590 dget(dentry); 591 } else 592 iput(inode); 593 } 594 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 595 596 return error; 597 } 598 599 /* 600 * mark the head page dirty 601 */ 602 static int hugetlbfs_set_page_dirty(struct page *page) 603 { 604 struct page *head = compound_head(page); 605 606 SetPageDirty(head); 607 return 0; 608 } 609 610 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 611 { 612 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); 613 614 buf->f_type = HUGETLBFS_MAGIC; 615 buf->f_bsize = HPAGE_SIZE; 616 if (sbinfo) { 617 spin_lock(&sbinfo->stat_lock); 618 /* If no limits set, just report 0 for max/free/used 619 * blocks, like simple_statfs() */ 620 if (sbinfo->max_blocks >= 0) { 621 buf->f_blocks = sbinfo->max_blocks; 622 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; 623 buf->f_files = sbinfo->max_inodes; 624 buf->f_ffree = sbinfo->free_inodes; 625 } 626 spin_unlock(&sbinfo->stat_lock); 627 } 628 buf->f_namelen = NAME_MAX; 629 return 0; 630 } 631 632 static void hugetlbfs_put_super(struct super_block *sb) 633 { 634 struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb); 635 636 if (sbi) { 637 sb->s_fs_info = NULL; 638 kfree(sbi); 639 } 640 } 641 642 static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) 643 { 644 if (sbinfo->free_inodes >= 0) { 645 spin_lock(&sbinfo->stat_lock); 646 if (unlikely(!sbinfo->free_inodes)) { 647 spin_unlock(&sbinfo->stat_lock); 648 return 0; 649 } 650 sbinfo->free_inodes--; 651 spin_unlock(&sbinfo->stat_lock); 652 } 653 654 return 1; 655 } 656 657 static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) 658 { 659 if (sbinfo->free_inodes >= 0) { 660 spin_lock(&sbinfo->stat_lock); 661 sbinfo->free_inodes++; 662 spin_unlock(&sbinfo->stat_lock); 663 } 664 } 665 666 667 static struct kmem_cache *hugetlbfs_inode_cachep; 668 669 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) 670 { 671 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); 672 struct hugetlbfs_inode_info *p; 673 674 if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) 675 return NULL; 676 p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL); 677 if (unlikely(!p)) { 678 hugetlbfs_inc_free_inodes(sbinfo); 679 return NULL; 680 } 681 return &p->vfs_inode; 682 } 683 684 static void hugetlbfs_destroy_inode(struct inode *inode) 685 { 686 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); 687 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); 688 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); 689 } 690 691 static const struct address_space_operations hugetlbfs_aops = { 692 .readpage = hugetlbfs_readpage, 693 .write_begin = hugetlbfs_write_begin, 694 .write_end = hugetlbfs_write_end, 695 .set_page_dirty = hugetlbfs_set_page_dirty, 696 }; 697 698 699 static void init_once(struct kmem_cache *cachep, void *foo) 700 { 701 struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; 702 703 inode_init_once(&ei->vfs_inode); 704 } 705 706 const struct file_operations hugetlbfs_file_operations = { 707 .read = hugetlbfs_read, 708 .mmap = hugetlbfs_file_mmap, 709 .fsync = simple_sync_file, 710 .get_unmapped_area = hugetlb_get_unmapped_area, 711 }; 712 713 static const struct inode_operations hugetlbfs_dir_inode_operations = { 714 .create = hugetlbfs_create, 715 .lookup = simple_lookup, 716 .link = simple_link, 717 .unlink = simple_unlink, 718 .symlink = hugetlbfs_symlink, 719 .mkdir = hugetlbfs_mkdir, 720 .rmdir = simple_rmdir, 721 .mknod = hugetlbfs_mknod, 722 .rename = simple_rename, 723 .setattr = hugetlbfs_setattr, 724 }; 725 726 static const struct inode_operations hugetlbfs_inode_operations = { 727 .setattr = hugetlbfs_setattr, 728 }; 729 730 static const struct super_operations hugetlbfs_ops = { 731 .alloc_inode = hugetlbfs_alloc_inode, 732 .destroy_inode = hugetlbfs_destroy_inode, 733 .statfs = hugetlbfs_statfs, 734 .delete_inode = hugetlbfs_delete_inode, 735 .drop_inode = hugetlbfs_drop_inode, 736 .put_super = hugetlbfs_put_super, 737 .show_options = generic_show_options, 738 }; 739 740 static int 741 hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) 742 { 743 char *p, *rest; 744 substring_t args[MAX_OPT_ARGS]; 745 int option; 746 747 if (!options) 748 return 0; 749 750 while ((p = strsep(&options, ",")) != NULL) { 751 int token; 752 if (!*p) 753 continue; 754 755 token = match_token(p, tokens, args); 756 switch (token) { 757 case Opt_uid: 758 if (match_int(&args[0], &option)) 759 goto bad_val; 760 pconfig->uid = option; 761 break; 762 763 case Opt_gid: 764 if (match_int(&args[0], &option)) 765 goto bad_val; 766 pconfig->gid = option; 767 break; 768 769 case Opt_mode: 770 if (match_octal(&args[0], &option)) 771 goto bad_val; 772 pconfig->mode = option & 01777U; 773 break; 774 775 case Opt_size: { 776 unsigned long long size; 777 /* memparse() will accept a K/M/G without a digit */ 778 if (!isdigit(*args[0].from)) 779 goto bad_val; 780 size = memparse(args[0].from, &rest); 781 if (*rest == '%') { 782 size <<= HPAGE_SHIFT; 783 size *= max_huge_pages; 784 do_div(size, 100); 785 } 786 pconfig->nr_blocks = (size >> HPAGE_SHIFT); 787 break; 788 } 789 790 case Opt_nr_inodes: 791 /* memparse() will accept a K/M/G without a digit */ 792 if (!isdigit(*args[0].from)) 793 goto bad_val; 794 pconfig->nr_inodes = memparse(args[0].from, &rest); 795 break; 796 797 default: 798 printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", 799 p); 800 return -EINVAL; 801 break; 802 } 803 } 804 return 0; 805 806 bad_val: 807 printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", 808 args[0].from, p); 809 return 1; 810 } 811 812 static int 813 hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) 814 { 815 struct inode * inode; 816 struct dentry * root; 817 int ret; 818 struct hugetlbfs_config config; 819 struct hugetlbfs_sb_info *sbinfo; 820 821 save_mount_options(sb, data); 822 823 config.nr_blocks = -1; /* No limit on size by default */ 824 config.nr_inodes = -1; /* No limit on number of inodes by default */ 825 config.uid = current->fsuid; 826 config.gid = current->fsgid; 827 config.mode = 0755; 828 ret = hugetlbfs_parse_options(data, &config); 829 if (ret) 830 return ret; 831 832 sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); 833 if (!sbinfo) 834 return -ENOMEM; 835 sb->s_fs_info = sbinfo; 836 spin_lock_init(&sbinfo->stat_lock); 837 sbinfo->max_blocks = config.nr_blocks; 838 sbinfo->free_blocks = config.nr_blocks; 839 sbinfo->max_inodes = config.nr_inodes; 840 sbinfo->free_inodes = config.nr_inodes; 841 sb->s_maxbytes = MAX_LFS_FILESIZE; 842 sb->s_blocksize = HPAGE_SIZE; 843 sb->s_blocksize_bits = HPAGE_SHIFT; 844 sb->s_magic = HUGETLBFS_MAGIC; 845 sb->s_op = &hugetlbfs_ops; 846 sb->s_time_gran = 1; 847 inode = hugetlbfs_get_inode(sb, config.uid, config.gid, 848 S_IFDIR | config.mode, 0); 849 if (!inode) 850 goto out_free; 851 852 root = d_alloc_root(inode); 853 if (!root) { 854 iput(inode); 855 goto out_free; 856 } 857 sb->s_root = root; 858 return 0; 859 out_free: 860 kfree(sbinfo); 861 return -ENOMEM; 862 } 863 864 int hugetlb_get_quota(struct address_space *mapping, long delta) 865 { 866 int ret = 0; 867 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); 868 869 if (sbinfo->free_blocks > -1) { 870 spin_lock(&sbinfo->stat_lock); 871 if (sbinfo->free_blocks - delta >= 0) 872 sbinfo->free_blocks -= delta; 873 else 874 ret = -ENOMEM; 875 spin_unlock(&sbinfo->stat_lock); 876 } 877 878 return ret; 879 } 880 881 void hugetlb_put_quota(struct address_space *mapping, long delta) 882 { 883 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); 884 885 if (sbinfo->free_blocks > -1) { 886 spin_lock(&sbinfo->stat_lock); 887 sbinfo->free_blocks += delta; 888 spin_unlock(&sbinfo->stat_lock); 889 } 890 } 891 892 static int hugetlbfs_get_sb(struct file_system_type *fs_type, 893 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 894 { 895 return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt); 896 } 897 898 static struct file_system_type hugetlbfs_fs_type = { 899 .name = "hugetlbfs", 900 .get_sb = hugetlbfs_get_sb, 901 .kill_sb = kill_litter_super, 902 }; 903 904 static struct vfsmount *hugetlbfs_vfsmount; 905 906 static int can_do_hugetlb_shm(void) 907 { 908 return likely(capable(CAP_IPC_LOCK) || 909 in_group_p(sysctl_hugetlb_shm_group) || 910 can_do_mlock()); 911 } 912 913 struct file *hugetlb_file_setup(const char *name, size_t size) 914 { 915 int error = -ENOMEM; 916 struct file *file; 917 struct inode *inode; 918 struct dentry *dentry, *root; 919 struct qstr quick_string; 920 921 if (!hugetlbfs_vfsmount) 922 return ERR_PTR(-ENOENT); 923 924 if (!can_do_hugetlb_shm()) 925 return ERR_PTR(-EPERM); 926 927 if (!user_shm_lock(size, current->user)) 928 return ERR_PTR(-ENOMEM); 929 930 root = hugetlbfs_vfsmount->mnt_root; 931 quick_string.name = name; 932 quick_string.len = strlen(quick_string.name); 933 quick_string.hash = 0; 934 dentry = d_alloc(root, &quick_string); 935 if (!dentry) 936 goto out_shm_unlock; 937 938 error = -ENOSPC; 939 inode = hugetlbfs_get_inode(root->d_sb, current->fsuid, 940 current->fsgid, S_IFREG | S_IRWXUGO, 0); 941 if (!inode) 942 goto out_dentry; 943 944 error = -ENOMEM; 945 if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT)) 946 goto out_inode; 947 948 d_instantiate(dentry, inode); 949 inode->i_size = size; 950 inode->i_nlink = 0; 951 952 error = -ENFILE; 953 file = alloc_file(hugetlbfs_vfsmount, dentry, 954 FMODE_WRITE | FMODE_READ, 955 &hugetlbfs_file_operations); 956 if (!file) 957 goto out_dentry; /* inode is already attached */ 958 959 return file; 960 961 out_inode: 962 iput(inode); 963 out_dentry: 964 dput(dentry); 965 out_shm_unlock: 966 user_shm_unlock(size, current->user); 967 return ERR_PTR(error); 968 } 969 970 static int __init init_hugetlbfs_fs(void) 971 { 972 int error; 973 struct vfsmount *vfsmount; 974 975 error = bdi_init(&hugetlbfs_backing_dev_info); 976 if (error) 977 return error; 978 979 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", 980 sizeof(struct hugetlbfs_inode_info), 981 0, 0, init_once); 982 if (hugetlbfs_inode_cachep == NULL) 983 goto out2; 984 985 error = register_filesystem(&hugetlbfs_fs_type); 986 if (error) 987 goto out; 988 989 vfsmount = kern_mount(&hugetlbfs_fs_type); 990 991 if (!IS_ERR(vfsmount)) { 992 hugetlbfs_vfsmount = vfsmount; 993 return 0; 994 } 995 996 error = PTR_ERR(vfsmount); 997 998 out: 999 if (error) 1000 kmem_cache_destroy(hugetlbfs_inode_cachep); 1001 out2: 1002 bdi_destroy(&hugetlbfs_backing_dev_info); 1003 return error; 1004 } 1005 1006 static void __exit exit_hugetlbfs_fs(void) 1007 { 1008 kmem_cache_destroy(hugetlbfs_inode_cachep); 1009 unregister_filesystem(&hugetlbfs_fs_type); 1010 bdi_destroy(&hugetlbfs_backing_dev_info); 1011 } 1012 1013 module_init(init_hugetlbfs_fs) 1014 module_exit(exit_hugetlbfs_fs) 1015 1016 MODULE_LICENSE("GPL"); 1017