/*
 * hugetlbpage-backed filesystem.  Based on ramfs.
 *
 * Nadia Yvette Chambers, 2002
 *
 * Copyright (C) 2002 Linus Torvalds.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h>		/* remove ASAP */
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/ctype.h>
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
#include <linux/parser.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/migrate.h>
#include <linux/uio.h>

#include <asm/uaccess.h>

static const struct super_operations hugetlbfs_ops;
static const struct address_space_operations hugetlbfs_aops;
const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;

struct hugetlbfs_config {
	kuid_t		uid;
	kgid_t		gid;
	umode_t		mode;
	long		max_hpages;
	long		nr_inodes;
	struct hstate	*hstate;
	long		min_hpages;
};

struct hugetlbfs_inode_info {
	struct shared_policy policy;
	struct inode vfs_inode;
};

static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
{
	return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
}

int sysctl_hugetlb_shm_group;

enum {
	Opt_size, Opt_nr_inodes,
	Opt_mode, Opt_uid, Opt_gid,
	Opt_pagesize, Opt_min_size,
	Opt_err,
};

static const match_table_t tokens = {
	{Opt_size,	"size=%s"},
	{Opt_nr_inodes,	"nr_inodes=%s"},
	{Opt_mode,	"mode=%o"},
	{Opt_uid,	"uid=%u"},
	{Opt_gid,	"gid=%u"},
	{Opt_pagesize,	"pagesize=%s"},
	{Opt_min_size,	"min_size=%s"},
	{Opt_err,	NULL},
};

static void huge_pagevec_release(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); ++i)
		put_page(pvec->pages[i]);

	pagevec_reinit(pvec);
}
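/*
 * mmap() on a hugetlbfs file: the handler below flags the vma as
 * VM_HUGETLB, rejects file offsets that are not huge-page aligned,
 * and reserves enough huge pages up front to back the whole mapping.
 * A writable mapping may also grow i_size.
 */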
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(file);
	loff_t len, vma_len;
	int ret;
	struct hstate *h = hstate_file(file);

	/*
	 * vma address alignment (but not the pgoff alignment) has
	 * already been checked by prepare_hugepage_range.  If you add
	 * any error returns here, do so after setting VM_HUGETLB, so
	 * that is_vm_hugetlb_page tests below unmap_region go the right
	 * way when do_mmap_pgoff unwinds (may be important on powerpc
	 * and ia64).
	 */
	vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
	vma->vm_ops = &hugetlb_vm_ops;

	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
		return -EINVAL;

	vma_len = (loff_t)(vma->vm_end - vma->vm_start);

	mutex_lock(&inode->i_mutex);
	file_accessed(file);

	ret = -ENOMEM;
	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	if (hugetlb_reserve_pages(inode,
				vma->vm_pgoff >> huge_page_order(h),
				len >> huge_page_shift(h), vma,
				vma->vm_flags))
		goto out;

	ret = 0;
	hugetlb_prefault_arch_hook(vma->vm_mm);
	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
		inode->i_size = len;
out:
	mutex_unlock(&inode->i_mutex);

	return ret;
}

/*
 * Called under down_write(mmap_sem).
 */

#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct hstate *h = hstate_file(file);
	struct vm_unmapped_area_info info;

	if (len & ~huge_page_mask(h))
		return -EINVAL;
	if (len > TASK_SIZE)
		return -ENOMEM;

	if (flags & MAP_FIXED) {
		if (prepare_hugepage_range(file, addr, len))
			return -EINVAL;
		return addr;
	}

	if (addr) {
		addr = ALIGN(addr, huge_page_size(h));
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}

	info.flags = 0;
	info.length = len;
	info.low_limit = TASK_UNMAPPED_BASE;
	info.high_limit = TASK_SIZE;
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	info.align_offset = 0;
	return vm_unmapped_area(&info);
}
#endif

static size_t
hugetlbfs_read_actor(struct page *page, unsigned long offset,
			struct iov_iter *to, unsigned long size)
{
	size_t copied = 0;
	int i, chunksize;

	/* Find which 4k chunk and offset within that chunk */
	i = offset >> PAGE_CACHE_SHIFT;
	offset = offset & ~PAGE_CACHE_MASK;

	while (size) {
		size_t n;
		chunksize = PAGE_CACHE_SIZE;
		if (offset)
			chunksize -= offset;
		if (chunksize > size)
			chunksize = size;
		n = copy_page_to_iter(&page[i], offset, chunksize, to);
		copied += n;
		if (n != chunksize)
			return copied;
		offset = 0;
		size -= chunksize;
		i++;
	}
	return copied;
}
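/*
 * Example of the position arithmetic used below (assuming a 2MB huge
 * page size): a read at ki_pos == 5MB yields index == 2 (the third
 * huge page) and offset == 1MB into that page; hugetlbfs_read_actor()
 * then copies the request out of the compound page in 4k chunks.
 */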
/*
 * Support for read() - Find the page attached to f_mapping and copy out the
 * data.  It's *very* similar to do_generic_mapping_read(), but we can't use
 * that since it has PAGE_CACHE_SIZE assumptions.
 */
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct hstate *h = hstate_file(file);
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
	unsigned long end_index;
	loff_t isize;
	ssize_t retval = 0;

	while (iov_iter_count(to)) {
		struct page *page;
		size_t nr, copied;

		/* nr is the maximum number of bytes to copy from this page */
		nr = huge_page_size(h);
		isize = i_size_read(inode);
		if (!isize)
			break;
		end_index = (isize - 1) >> huge_page_shift(h);
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
			if (nr <= offset)
				break;
		}
		nr = nr - offset;

		/* Find the page */
		page = find_lock_page(mapping, index);
		if (unlikely(page == NULL)) {
			/*
			 * We have a HOLE, zero out the user-buffer for the
			 * length of the hole or request.
			 */
			copied = iov_iter_zero(nr, to);
		} else {
			unlock_page(page);

			/*
			 * We have the page, copy it to user space buffer.
			 */
			copied = hugetlbfs_read_actor(page, offset, to, nr);
			page_cache_release(page);
		}
		offset += copied;
		retval += copied;
		if (copied != nr && iov_iter_count(to)) {
			if (!retval)
				retval = -EFAULT;
			break;
		}
		index += offset >> huge_page_shift(h);
		offset &= ~huge_page_mask(h);
	}
	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
	return retval;
}

static int hugetlbfs_write_begin(struct file *file,
			struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return -EINVAL;
}

static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	BUG();
	return -EINVAL;
}

static void truncate_huge_page(struct page *page)
{
	ClearPageDirty(page);
	ClearPageUptodate(page);
	delete_from_page_cache(page);
}

static void truncate_hugepages(struct inode *inode, loff_t lstart)
{
	struct hstate *h = hstate_inode(inode);
	struct address_space *mapping = &inode->i_data;
	const pgoff_t start = lstart >> huge_page_shift(h);
	struct pagevec pvec;
	pgoff_t next;
	int i, freed = 0;

	pagevec_init(&pvec, 0);
	next = start;
	while (1) {
		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
			if (next == start)
				break;
			next = start;
			continue;
		}

		for (i = 0; i < pagevec_count(&pvec); ++i) {
			struct page *page = pvec.pages[i];

			lock_page(page);
			if (page->index > next)
				next = page->index;
			++next;
			truncate_huge_page(page);
			unlock_page(page);
			freed++;
		}
		huge_pagevec_release(&pvec);
	}
	BUG_ON(!lstart && mapping->nrpages);
	hugetlb_unreserve_pages(inode, start, freed);
}

static void hugetlbfs_evict_inode(struct inode *inode)
{
	struct resv_map *resv_map;

	truncate_hugepages(inode, 0);
	resv_map = (struct resv_map *)inode->i_mapping->private_data;
	/* The root inode doesn't have a resv_map, so check for NULL */
	if (resv_map)
		resv_map_release(&resv_map->refs);
	clear_inode(inode);
}
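/*
 * Truncation is a two-step operation: hugetlb_vmtruncate() first walks
 * the inode's interval tree and unmaps every vma that overlaps the
 * truncated range, then truncate_hugepages() drops the now-unmapped
 * pages from the page cache and gives back their reservations.
 */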
static inline void
hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
{
	struct vm_area_struct *vma;

	vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
		unsigned long v_offset;

		/*
		 * Can the expression below overflow on 32-bit arches?
		 * No, because the interval tree returns us only those vmas
		 * which overlap the truncated area starting at pgoff,
		 * and no vma on a 32-bit arch can span beyond 4GB.
		 */
		if (vma->vm_pgoff < pgoff)
			v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
		else
			v_offset = 0;

		unmap_hugepage_range(vma, vma->vm_start + v_offset,
				     vma->vm_end, NULL);
	}
}

static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
	pgoff_t pgoff;
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);

	BUG_ON(offset & ~huge_page_mask(h));
	pgoff = offset >> PAGE_SHIFT;

	i_size_write(inode, offset);
	i_mmap_lock_write(mapping);
	if (!RB_EMPTY_ROOT(&mapping->i_mmap))
		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
	i_mmap_unlock_write(mapping);
	truncate_hugepages(inode, offset);
	return 0;
}

static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct hstate *h = hstate_inode(inode);
	int error;
	unsigned int ia_valid = attr->ia_valid;

	BUG_ON(!inode);

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (ia_valid & ATTR_SIZE) {
		/* The new size must be a multiple of the huge page size */
		if (attr->ia_size & ~huge_page_mask(h))
			return -EINVAL;
		error = hugetlb_vmtruncate(inode, attr->ia_size);
		if (error)
			return error;
	}

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}

static struct inode *hugetlbfs_get_root(struct super_block *sb,
					struct hugetlbfs_config *config)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (inode) {
		struct hugetlbfs_inode_info *info;
		inode->i_ino = get_next_ino();
		inode->i_mode = S_IFDIR | config->mode;
		inode->i_uid = config->uid;
		inode->i_gid = config->gid;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		info = HUGETLBFS_I(inode);
		mpol_shared_policy_init(&info->policy, NULL);
		inode->i_op = &hugetlbfs_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;
		/* directory inodes start off with i_nlink == 2 (for "." entry) */
		inc_nlink(inode);
		lockdep_annotate_inode_mutex_key(inode);
	}
	return inode;
}
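/*
 * Note that the root inode above is created without a resv_map and
 * without the i_mmap_rwsem lockdep class below; only regular inodes
 * allocated by hugetlbfs_get_inode() carry a reservation map in
 * mapping->private_data.
 */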
/*
 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
 * be taken from reclaim -- unlike regular filesystems. This needs an
 * annotation because huge_pmd_share() does an allocation under
 * i_mmap_rwsem.
 */
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;

static struct inode *hugetlbfs_get_inode(struct super_block *sb,
					struct inode *dir,
					umode_t mode, dev_t dev)
{
	struct inode *inode;
	struct resv_map *resv_map;

	resv_map = resv_map_alloc();
	if (!resv_map)
		return NULL;

	inode = new_inode(sb);
	if (inode) {
		struct hugetlbfs_inode_info *info;
		inode->i_ino = get_next_ino();
		inode_init_owner(inode, dir, mode);
		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
				&hugetlbfs_i_mmap_rwsem_key);
		inode->i_mapping->a_ops = &hugetlbfs_aops;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		inode->i_mapping->private_data = resv_map;
		info = HUGETLBFS_I(inode);
		/*
		 * The policy is initialized here even if we are creating a
		 * private inode because initialization simply creates an
		 * empty rb tree and calls spin_lock_init(); later, when we
		 * call mpol_free_shared_policy(), it will just return because
		 * the rb tree will still be empty.
		 */
		mpol_shared_policy_init(&info->policy, NULL);
		switch (mode & S_IFMT) {
		default:
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_op = &hugetlbfs_inode_operations;
			inode->i_fop = &hugetlbfs_file_operations;
			break;
		case S_IFDIR:
			inode->i_op = &hugetlbfs_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;

			/* directory inodes start off with i_nlink == 2 (for "." entry) */
			inc_nlink(inode);
			break;
		case S_IFLNK:
			inode->i_op = &page_symlink_inode_operations;
			break;
		}
		lockdep_annotate_inode_mutex_key(inode);
	} else
		kref_put(&resv_map->refs, resv_map_release);

	return inode;
}
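/*
 * The resv_map allocated above lives as long as the inode: it is
 * either released immediately via kref_put() when new_inode() fails,
 * or dropped in hugetlbfs_evict_inode() once the last reference to
 * the inode goes away.
 */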
/*
 * File creation. Allocate an inode, and we're done.
 */
static int hugetlbfs_mknod(struct inode *dir,
			struct dentry *dentry, umode_t mode, dev_t dev)
{
	struct inode *inode;
	int error = -ENOSPC;

	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
	if (inode) {
		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
		d_instantiate(dentry, inode);
		dget(dentry);	/* Extra count - pin the dentry in core */
		error = 0;
	}
	return error;
}

static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
	if (!retval)
		inc_nlink(dir);
	return retval;
}

static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
{
	return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
}

static int hugetlbfs_symlink(struct inode *dir,
			struct dentry *dentry, const char *symname)
{
	struct inode *inode;
	int error = -ENOSPC;

	inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
	if (inode) {
		int l = strlen(symname)+1;
		error = page_symlink(inode, symname, l);
		if (!error) {
			d_instantiate(dentry, inode);
			dget(dentry);
		} else
			iput(inode);
	}
	dir->i_ctime = dir->i_mtime = CURRENT_TIME;

	return error;
}

/*
 * mark the head page dirty
 */
static int hugetlbfs_set_page_dirty(struct page *page)
{
	struct page *head = compound_head(page);

	SetPageDirty(head);
	return 0;
}

static int hugetlbfs_migrate_page(struct address_space *mapping,
				struct page *newpage, struct page *page,
				enum migrate_mode mode)
{
	int rc;

	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;
	migrate_page_copy(newpage, page);

	return MIGRATEPAGE_SUCCESS;
}

static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
	struct hstate *h = hstate_inode(d_inode(dentry));

	buf->f_type = HUGETLBFS_MAGIC;
	buf->f_bsize = huge_page_size(h);
	if (sbinfo) {
		spin_lock(&sbinfo->stat_lock);
		/* If no limits set, just report 0 for max/free/used
		 * blocks, like simple_statfs() */
		if (sbinfo->spool) {
			long free_pages;

			spin_lock(&sbinfo->spool->lock);
			buf->f_blocks = sbinfo->spool->max_hpages;
			free_pages = sbinfo->spool->max_hpages
				- sbinfo->spool->used_hpages;
			buf->f_bavail = buf->f_bfree = free_pages;
			spin_unlock(&sbinfo->spool->lock);
			buf->f_files = sbinfo->max_inodes;
			buf->f_ffree = sbinfo->free_inodes;
		}
		spin_unlock(&sbinfo->stat_lock);
	}
	buf->f_namelen = NAME_MAX;
	return 0;
}

static void hugetlbfs_put_super(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);

	if (sbi) {
		sb->s_fs_info = NULL;

		if (sbi->spool)
			hugepage_put_subpool(sbi->spool);

		kfree(sbi);
	}
}

static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes >= 0) {
		spin_lock(&sbinfo->stat_lock);
		if (unlikely(!sbinfo->free_inodes)) {
			spin_unlock(&sbinfo->stat_lock);
			return 0;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}

	return 1;
}

static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes >= 0) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}
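/*
 * Inode accounting: when the filesystem is mounted without a
 * nr_inodes= limit, free_inodes stays at -1 and the "free_inodes >= 0"
 * tests above turn both helpers into no-ops; otherwise every
 * allocation decrements the counter and every destruction returns it.
 */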
static struct kmem_cache *hugetlbfs_inode_cachep;

static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
	struct hugetlbfs_inode_info *p;

	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
		return NULL;
	p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
	if (unlikely(!p)) {
		hugetlbfs_inc_free_inodes(sbinfo);
		return NULL;
	}
	return &p->vfs_inode;
}

static void hugetlbfs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}

static void hugetlbfs_destroy_inode(struct inode *inode)
{
	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
	call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
}

static const struct address_space_operations hugetlbfs_aops = {
	.write_begin	= hugetlbfs_write_begin,
	.write_end	= hugetlbfs_write_end,
	.set_page_dirty	= hugetlbfs_set_page_dirty,
	.migratepage	= hugetlbfs_migrate_page,
};

static void init_once(void *foo)
{
	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;

	inode_init_once(&ei->vfs_inode);
}

const struct file_operations hugetlbfs_file_operations = {
	.read_iter		= hugetlbfs_read_iter,
	.mmap			= hugetlbfs_file_mmap,
	.fsync			= noop_fsync,
	.get_unmapped_area	= hugetlb_get_unmapped_area,
	.llseek			= default_llseek,
};

static const struct inode_operations hugetlbfs_dir_inode_operations = {
	.create		= hugetlbfs_create,
	.lookup		= simple_lookup,
	.link		= simple_link,
	.unlink		= simple_unlink,
	.symlink	= hugetlbfs_symlink,
	.mkdir		= hugetlbfs_mkdir,
	.rmdir		= simple_rmdir,
	.mknod		= hugetlbfs_mknod,
	.rename		= simple_rename,
	.setattr	= hugetlbfs_setattr,
};

static const struct inode_operations hugetlbfs_inode_operations = {
	.setattr	= hugetlbfs_setattr,
};

static const struct super_operations hugetlbfs_ops = {
	.alloc_inode	= hugetlbfs_alloc_inode,
	.destroy_inode	= hugetlbfs_destroy_inode,
	.evict_inode	= hugetlbfs_evict_inode,
	.statfs		= hugetlbfs_statfs,
	.put_super	= hugetlbfs_put_super,
	.show_options	= generic_show_options,
};

enum { NO_SIZE, SIZE_STD, SIZE_PERCENT };
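/*
 * Worked example for the conversion below (assuming a 2MB hstate with
 * max_huge_pages == 512): "size=50%" arrives as size_opt == 50 with
 * val_type == SIZE_PERCENT, and 50 * 512 / 100 == 256 huge pages are
 * allotted to the mount.
 */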
/*
 * Convert the size option passed on the command line to a number of huge
 * pages in the pool specified by hstate.  The size option may be given in
 * bytes (val_type == SIZE_STD) or as a percentage of the pool
 * (val_type == SIZE_PERCENT).
 */
static long long
hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
								int val_type)
{
	if (val_type == NO_SIZE)
		return -1;

	if (val_type == SIZE_PERCENT) {
		size_opt <<= huge_page_shift(h);
		size_opt *= h->max_huge_pages;
		do_div(size_opt, 100);
	}

	size_opt >>= huge_page_shift(h);
	return size_opt;
}
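/*
 * Example mount command exercising the options parsed below (the mount
 * point and values are illustrative only):
 *
 *	mount -t hugetlbfs -o size=1G,min_size=512M,nr_inodes=64,pagesize=2M \
 *		none /mnt/huge
 *
 * size= and min_size= accept K/M/G suffixes or a trailing '%' of the
 * pool; pagesize= selects the hstate backing the mount.
 */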
static int
hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
{
	char *p, *rest;
	substring_t args[MAX_OPT_ARGS];
	int option;
	unsigned long long max_size_opt = 0, min_size_opt = 0;
	int max_val_type = NO_SIZE, min_val_type = NO_SIZE;

	if (!options)
		return 0;

	while ((p = strsep(&options, ",")) != NULL) {
		int token;
		if (!*p)
			continue;

		token = match_token(p, tokens, args);
		switch (token) {
		case Opt_uid:
			if (match_int(&args[0], &option))
				goto bad_val;
			pconfig->uid = make_kuid(current_user_ns(), option);
			if (!uid_valid(pconfig->uid))
				goto bad_val;
			break;

		case Opt_gid:
			if (match_int(&args[0], &option))
				goto bad_val;
			pconfig->gid = make_kgid(current_user_ns(), option);
			if (!gid_valid(pconfig->gid))
				goto bad_val;
			break;

		case Opt_mode:
			if (match_octal(&args[0], &option))
				goto bad_val;
			pconfig->mode = option & 01777U;
			break;

		case Opt_size: {
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
			max_size_opt = memparse(args[0].from, &rest);
			max_val_type = SIZE_STD;
			if (*rest == '%')
				max_val_type = SIZE_PERCENT;
			break;
		}

		case Opt_nr_inodes:
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
			pconfig->nr_inodes = memparse(args[0].from, &rest);
			break;

		case Opt_pagesize: {
			unsigned long ps;
			ps = memparse(args[0].from, &rest);
			pconfig->hstate = size_to_hstate(ps);
			if (!pconfig->hstate) {
				pr_err("Unsupported page size %lu MB\n",
					ps >> 20);
				return -EINVAL;
			}
			break;
		}

		case Opt_min_size: {
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
			min_size_opt = memparse(args[0].from, &rest);
			min_val_type = SIZE_STD;
			if (*rest == '%')
				min_val_type = SIZE_PERCENT;
			break;
		}

		default:
			pr_err("Bad mount option: \"%s\"\n", p);
			return -EINVAL;
		}
	}

	/*
	 * Use the huge page pool size (in hstate) to convert the size
	 * options to a number of huge pages.  If NO_SIZE, -1 is returned.
	 */
	pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
						max_size_opt, max_val_type);
	pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
						min_size_opt, min_val_type);

	/*
	 * If max_size was specified, then min_size must be smaller
	 */
	if (max_val_type > NO_SIZE &&
	    pconfig->min_hpages > pconfig->max_hpages) {
		pr_err("minimum size cannot be greater than maximum size\n");
		return -EINVAL;
	}

	return 0;

bad_val:
	pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
	return -EINVAL;
}

static int
hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
{
	int ret;
	struct hugetlbfs_config config;
	struct hugetlbfs_sb_info *sbinfo;

	save_mount_options(sb, data);

	config.max_hpages = -1;	/* No limit on size by default */
	config.nr_inodes = -1;	/* No limit on number of inodes by default */
	config.uid = current_fsuid();
	config.gid = current_fsgid();
	config.mode = 0755;
	config.hstate = &default_hstate;
	config.min_hpages = -1;	/* No default minimum size */
	ret = hugetlbfs_parse_options(data, &config);
	if (ret)
		return ret;

	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
	if (!sbinfo)
		return -ENOMEM;
	sb->s_fs_info = sbinfo;
	sbinfo->hstate = config.hstate;
	spin_lock_init(&sbinfo->stat_lock);
	sbinfo->max_inodes = config.nr_inodes;
	sbinfo->free_inodes = config.nr_inodes;
	sbinfo->spool = NULL;
	/*
	 * Allocate and initialize the subpool if either the maximum or
	 * minimum size is specified.  Any needed reservations (for the
	 * minimum size) are taken when the subpool is created.
	 */
	if (config.max_hpages != -1 || config.min_hpages != -1) {
		sbinfo->spool = hugepage_new_subpool(config.hstate,
							config.max_hpages,
							config.min_hpages);
		if (!sbinfo->spool)
			goto out_free;
	}
	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_blocksize = huge_page_size(config.hstate);
	sb->s_blocksize_bits = huge_page_shift(config.hstate);
	sb->s_magic = HUGETLBFS_MAGIC;
	sb->s_op = &hugetlbfs_ops;
	sb->s_time_gran = 1;
	sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
	if (!sb->s_root)
		goto out_free;
	return 0;
out_free:
	kfree(sbinfo->spool);
	kfree(sbinfo);
	return -ENOMEM;
}

static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
}

static struct file_system_type hugetlbfs_fs_type = {
	.name		= "hugetlbfs",
	.mount		= hugetlbfs_mount,
	.kill_sb	= kill_litter_super,
};
MODULE_ALIAS_FS("hugetlbfs");

static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];

static int can_do_hugetlb_shm(void)
{
	kgid_t shm_group;
	shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
	return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
}

static int get_hstate_idx(int page_size_log)
{
	struct hstate *h = hstate_sizelog(page_size_log);

	if (!h)
		return -1;
	return h - hstates;
}

static const struct dentry_operations anon_ops = {
	.d_dname = simple_dname
};
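/*
 * hugetlb_file_setup() below backs the kernel-internal users of
 * hugetlbfs; it is what shmget(SHM_HUGETLB) and mmap(MAP_HUGETLB) use
 * to obtain an unlinked hugetlbfs file on one of the internal
 * per-hstate mounts set up in init_hugetlbfs_fs().
 */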
/*
 * Note that size should be aligned to proper hugepage size in caller side,
 * otherwise hugetlb_reserve_pages() reserves one fewer huge page than
 * intended.
 */
struct file *hugetlb_file_setup(const char *name, size_t size,
				vm_flags_t acctflag, struct user_struct **user,
				int creat_flags, int page_size_log)
{
	struct file *file = ERR_PTR(-ENOMEM);
	struct inode *inode;
	struct path path;
	struct super_block *sb;
	struct qstr quick_string;
	int hstate_idx;

	hstate_idx = get_hstate_idx(page_size_log);
	if (hstate_idx < 0)
		return ERR_PTR(-ENODEV);

	*user = NULL;
	if (!hugetlbfs_vfsmount[hstate_idx])
		return ERR_PTR(-ENOENT);

	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
		*user = current_user();
		if (user_shm_lock(size, *user)) {
			task_lock(current);
			pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
				current->comm, current->pid);
			task_unlock(current);
		} else {
			*user = NULL;
			return ERR_PTR(-EPERM);
		}
	}

	sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;
	quick_string.name = name;
	quick_string.len = strlen(quick_string.name);
	quick_string.hash = 0;
	path.dentry = d_alloc_pseudo(sb, &quick_string);
	if (!path.dentry)
		goto out_shm_unlock;

	d_set_d_op(path.dentry, &anon_ops);
	path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
	file = ERR_PTR(-ENOSPC);
	inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
	if (!inode)
		goto out_dentry;

	file = ERR_PTR(-ENOMEM);
	if (hugetlb_reserve_pages(inode, 0,
			size >> huge_page_shift(hstate_inode(inode)), NULL,
			acctflag))
		goto out_inode;

	d_instantiate(path.dentry, inode);
	inode->i_size = size;
	clear_nlink(inode);

	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
			&hugetlbfs_file_operations);
	if (IS_ERR(file))
		goto out_dentry; /* inode is already attached */

	return file;

out_inode:
	iput(inode);
out_dentry:
	path_put(&path);
out_shm_unlock:
	if (*user) {
		user_shm_unlock(size, *user);
		*user = NULL;
	}
	return file;
}

static int __init init_hugetlbfs_fs(void)
{
	struct hstate *h;
	int error;
	int i;

	if (!hugepages_supported()) {
		pr_info("disabling because there are no supported hugepage sizes\n");
		return -ENOTSUPP;
	}

	error = -ENOMEM;
	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
					sizeof(struct hugetlbfs_inode_info),
					0, 0, init_once);
	if (hugetlbfs_inode_cachep == NULL)
		goto out2;

	error = register_filesystem(&hugetlbfs_fs_type);
	if (error)
		goto out;

	i = 0;
	for_each_hstate(h) {
		char buf[50];
		unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);

		snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
		hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
							buf);

		if (IS_ERR(hugetlbfs_vfsmount[i])) {
			pr_err("Cannot mount internal hugetlbfs for page size %uK",
			       ps_kb);
			error = PTR_ERR(hugetlbfs_vfsmount[i]);
			hugetlbfs_vfsmount[i] = NULL;
		}
		i++;
	}
	/* Non default hstates are optional */
	if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
		return 0;

 out:
	kmem_cache_destroy(hugetlbfs_inode_cachep);
 out2:
	return error;
}
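/*
 * The vfsmount array above is indexed in hstate order: one internal
 * mount per configured huge page size.  Only the default hstate's
 * mount is required for the module to load; failure to mount a
 * non-default size just leaves its slot NULL.
 */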
static void __exit exit_hugetlbfs_fs(void)
{
	struct hstate *h;
	int i;

	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(hugetlbfs_inode_cachep);
	i = 0;
	for_each_hstate(h)
		kern_unmount(hugetlbfs_vfsmount[i++]);
	unregister_filesystem(&hugetlbfs_fs_type);
}

module_init(init_hugetlbfs_fs)
module_exit(exit_hugetlbfs_fs)

MODULE_LICENSE("GPL");