xref: /openbmc/linux/fs/hugetlbfs/inode.c (revision 2572f00d)
1 /*
2  * hugetlbpage-backed filesystem.  Based on ramfs.
3  *
4  * Nadia Yvette Chambers, 2002
5  *
6  * Copyright (C) 2002 Linus Torvalds.
7  */
8 
9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10 
11 #include <linux/module.h>
12 #include <linux/thread_info.h>
13 #include <asm/current.h>
14 #include <linux/sched.h>		/* remove ASAP */
15 #include <linux/falloc.h>
16 #include <linux/fs.h>
17 #include <linux/mount.h>
18 #include <linux/file.h>
19 #include <linux/kernel.h>
20 #include <linux/writeback.h>
21 #include <linux/pagemap.h>
22 #include <linux/highmem.h>
23 #include <linux/init.h>
24 #include <linux/string.h>
25 #include <linux/capability.h>
26 #include <linux/ctype.h>
27 #include <linux/backing-dev.h>
28 #include <linux/hugetlb.h>
29 #include <linux/pagevec.h>
30 #include <linux/parser.h>
31 #include <linux/mman.h>
32 #include <linux/slab.h>
33 #include <linux/dnotify.h>
34 #include <linux/statfs.h>
35 #include <linux/security.h>
36 #include <linux/magic.h>
37 #include <linux/migrate.h>
38 #include <linux/uio.h>
39 
40 #include <asm/uaccess.h>
41 
42 static const struct super_operations hugetlbfs_ops;
43 static const struct address_space_operations hugetlbfs_aops;
44 const struct file_operations hugetlbfs_file_operations;
45 static const struct inode_operations hugetlbfs_dir_inode_operations;
46 static const struct inode_operations hugetlbfs_inode_operations;
47 
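/*
 * Mount-time configuration, filled in by hugetlbfs_parse_options()
 * and consumed by hugetlbfs_fill_super().
 */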
48 struct hugetlbfs_config {
49 	kuid_t   uid;
50 	kgid_t   gid;
51 	umode_t mode;
52 	long	max_hpages;
53 	long	nr_inodes;
54 	struct hstate *hstate;
55 	long    min_hpages;
56 };
57 
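/*
 * Per-inode data: the shared NUMA allocation policy plus the embedded
 * VFS inode (see HUGETLBFS_I() below).
 */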
58 struct hugetlbfs_inode_info {
59 	struct shared_policy policy;
60 	struct inode vfs_inode;
61 };
62 
63 static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
64 {
65 	return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
66 }
67 
68 int sysctl_hugetlb_shm_group;
69 
70 enum {
71 	Opt_size, Opt_nr_inodes,
72 	Opt_mode, Opt_uid, Opt_gid,
73 	Opt_pagesize, Opt_min_size,
74 	Opt_err,
75 };
76 
77 static const match_table_t tokens = {
78 	{Opt_size,	"size=%s"},
79 	{Opt_nr_inodes,	"nr_inodes=%s"},
80 	{Opt_mode,	"mode=%o"},
81 	{Opt_uid,	"uid=%u"},
82 	{Opt_gid,	"gid=%u"},
83 	{Opt_pagesize,	"pagesize=%s"},
84 	{Opt_min_size,	"min_size=%s"},
85 	{Opt_err,	NULL},
86 };
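
/*
 * Example (illustrative) use of the options above at mount time:
 *   mount -t hugetlbfs -o size=512M,min_size=256M,pagesize=2M,mode=1770 none /mnt/huge
 */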
87 
88 #ifdef CONFIG_NUMA
89 static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
90 					struct inode *inode, pgoff_t index)
91 {
92 	vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
93 							index);
94 }
95 
96 static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
97 {
98 	mpol_cond_put(vma->vm_policy);
99 }
100 #else
101 static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
102 					struct inode *inode, pgoff_t index)
103 {
104 }
105 
106 static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
107 {
108 }
109 #endif
110 
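/* Drop the reference held on each page in the pagevec and reset it for reuse. */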
111 static void huge_pagevec_release(struct pagevec *pvec)
112 {
113 	int i;
114 
115 	for (i = 0; i < pagevec_count(pvec); ++i)
116 		put_page(pvec->pages[i]);
117 
118 	pagevec_reinit(pvec);
119 }
120 
121 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
122 {
123 	struct inode *inode = file_inode(file);
124 	loff_t len, vma_len;
125 	int ret;
126 	struct hstate *h = hstate_file(file);
127 
128 	/*
129 	 * vma address alignment (but not the pgoff alignment) has
130 	 * already been checked by prepare_hugepage_range.  If you add
131 	 * any error returns here, do so after setting VM_HUGETLB, so
132 	 * is_vm_hugetlb_page tests below unmap_region go the right
133 	 * way when do_mmap_pgoff unwinds (may be important on powerpc
134 	 * and ia64).
135 	 */
136 	vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
137 	vma->vm_ops = &hugetlb_vm_ops;
138 
139 	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
140 		return -EINVAL;
141 
142 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
143 
144 	mutex_lock(&inode->i_mutex);
145 	file_accessed(file);
146 
147 	ret = -ENOMEM;
148 	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
149 
150 	if (hugetlb_reserve_pages(inode,
151 				vma->vm_pgoff >> huge_page_order(h),
152 				len >> huge_page_shift(h), vma,
153 				vma->vm_flags))
154 		goto out;
155 
156 	ret = 0;
157 	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
158 		inode->i_size = len;
159 out:
160 	mutex_unlock(&inode->i_mutex);
161 
162 	return ret;
163 }
164 
165 /*
166  * Called under down_write(mmap_sem).
167  */
168 
169 #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
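/*
 * Generic huge-page-aware get_unmapped_area(), used when the architecture
 * does not provide its own implementation.
 */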
170 static unsigned long
171 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
172 		unsigned long len, unsigned long pgoff, unsigned long flags)
173 {
174 	struct mm_struct *mm = current->mm;
175 	struct vm_area_struct *vma;
176 	struct hstate *h = hstate_file(file);
177 	struct vm_unmapped_area_info info;
178 
179 	if (len & ~huge_page_mask(h))
180 		return -EINVAL;
181 	if (len > TASK_SIZE)
182 		return -ENOMEM;
183 
184 	if (flags & MAP_FIXED) {
185 		if (prepare_hugepage_range(file, addr, len))
186 			return -EINVAL;
187 		return addr;
188 	}
189 
190 	if (addr) {
191 		addr = ALIGN(addr, huge_page_size(h));
192 		vma = find_vma(mm, addr);
193 		if (TASK_SIZE - len >= addr &&
194 		    (!vma || addr + len <= vma->vm_start))
195 			return addr;
196 	}
197 
198 	info.flags = 0;
199 	info.length = len;
200 	info.low_limit = TASK_UNMAPPED_BASE;
201 	info.high_limit = TASK_SIZE;
202 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
203 	info.align_offset = 0;
204 	return vm_unmapped_area(&info);
205 }
206 #endif
207 
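/*
 * Copy up to @size bytes from the given huge page, starting at @offset,
 * into the iov_iter.  Returns the number of bytes actually copied.
 */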
208 static size_t
209 hugetlbfs_read_actor(struct page *page, unsigned long offset,
210 			struct iov_iter *to, unsigned long size)
211 {
212 	size_t copied = 0;
213 	int i, chunksize;
214 
215 	/* Find which 4k chunk and offset within that chunk */
216 	i = offset >> PAGE_CACHE_SHIFT;
217 	offset = offset & ~PAGE_CACHE_MASK;
218 
219 	while (size) {
220 		size_t n;
221 		chunksize = PAGE_CACHE_SIZE;
222 		if (offset)
223 			chunksize -= offset;
224 		if (chunksize > size)
225 			chunksize = size;
226 		n = copy_page_to_iter(&page[i], offset, chunksize, to);
227 		copied += n;
228 		if (n != chunksize)
229 			return copied;
230 		offset = 0;
231 		size -= chunksize;
232 		i++;
233 	}
234 	return copied;
235 }
236 
237 /*
238  * Support for read() - Find the page attached to f_mapping and copy out the
239  * data. It's *very* similar to do_generic_mapping_read(); we can't use that
240  * since it has PAGE_CACHE_SIZE assumptions.
241  */
242 static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
243 {
244 	struct file *file = iocb->ki_filp;
245 	struct hstate *h = hstate_file(file);
246 	struct address_space *mapping = file->f_mapping;
247 	struct inode *inode = mapping->host;
248 	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
249 	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
250 	unsigned long end_index;
251 	loff_t isize;
252 	ssize_t retval = 0;
253 
254 	while (iov_iter_count(to)) {
255 		struct page *page;
256 		size_t nr, copied;
257 
258 		/* nr is the maximum number of bytes to copy from this page */
259 		nr = huge_page_size(h);
260 		isize = i_size_read(inode);
261 		if (!isize)
262 			break;
263 		end_index = (isize - 1) >> huge_page_shift(h);
264 		if (index > end_index)
265 			break;
266 		if (index == end_index) {
267 			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
268 			if (nr <= offset)
269 				break;
270 		}
271 		nr = nr - offset;
272 
273 		/* Find the page */
274 		page = find_lock_page(mapping, index);
275 		if (unlikely(page == NULL)) {
276 			/*
277 			 * We have a HOLE, zero out the user-buffer for the
278 			 * length of the hole or request.
279 			 */
280 			copied = iov_iter_zero(nr, to);
281 		} else {
282 			unlock_page(page);
283 
284 			/*
285 			 * We have the page, copy it to user space buffer.
286 			 */
287 			copied = hugetlbfs_read_actor(page, offset, to, nr);
288 			page_cache_release(page);
289 		}
290 		offset += copied;
291 		retval += copied;
292 		if (copied != nr && iov_iter_count(to)) {
293 			if (!retval)
294 				retval = -EFAULT;
295 			break;
296 		}
297 		index += offset >> huge_page_shift(h);
298 		offset &= ~huge_page_mask(h);
299 	}
300 	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
301 	return retval;
302 }
303 
304 static int hugetlbfs_write_begin(struct file *file,
305 			struct address_space *mapping,
306 			loff_t pos, unsigned len, unsigned flags,
307 			struct page **pagep, void **fsdata)
308 {
309 	return -EINVAL;
310 }
311 
312 static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
313 			loff_t pos, unsigned len, unsigned copied,
314 			struct page *page, void *fsdata)
315 {
316 	BUG();
317 	return -EINVAL;
318 }
319 
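/* Remove a huge page from the page cache, clearing its dirty/uptodate bits first. */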
320 static void remove_huge_page(struct page *page)
321 {
322 	ClearPageDirty(page);
323 	ClearPageUptodate(page);
324 	delete_from_page_cache(page);
325 }
326 
327 
328 /*
329  * remove_inode_hugepages handles two distinct cases: truncation and hole
330  * punch.  There are subtle differences in operation for each case.
331  *
332  * truncation is indicated by end of range being LLONG_MAX
333  *	In this case, we first scan the range and release found pages.
334  *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
335  *	maps and global counts.  Page faults cannot race with truncation
336  *	in this routine.  hugetlb_no_page() prevents page faults in the
337  *	truncated range.  It checks i_size before allocation, and again after
338  *	with the page table lock for the page held.  The same lock must be
339  *	acquired to unmap a page.
340  * hole punch is indicated if end is not LLONG_MAX
341  *	In the hole punch case we scan the range and release found pages.
342  *	Only when releasing a page is the associated region/reserv map
343  *	deleted.  The region/reserv map for ranges without associated
344  *	pages is not modified.  Page faults can race with hole punch.
345  *	This is indicated if we find a mapped page.
346  * Note: If the passed end of range value is beyond the end of file, but
347  * not LLONG_MAX, this routine still performs a hole punch operation.
348  */
349 static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
350 				   loff_t lend)
351 {
352 	struct hstate *h = hstate_inode(inode);
353 	struct address_space *mapping = &inode->i_data;
354 	const pgoff_t start = lstart >> huge_page_shift(h);
355 	const pgoff_t end = lend >> huge_page_shift(h);
356 	struct vm_area_struct pseudo_vma;
357 	struct pagevec pvec;
358 	pgoff_t next;
359 	int i, freed = 0;
360 	long lookup_nr = PAGEVEC_SIZE;
361 	bool truncate_op = (lend == LLONG_MAX);
362 
363 	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
364 	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
365 	pagevec_init(&pvec, 0);
366 	next = start;
367 	while (next < end) {
368 		/*
369 		 * Don't grab more pages than the number left in the range.
370 		 */
371 		if (end - next < lookup_nr)
372 			lookup_nr = end - next;
373 
374 		/*
375 		 * When no more pages are found, we are done.
376 		 */
377 		if (!pagevec_lookup(&pvec, mapping, next, lookup_nr))
378 			break;
379 
380 		for (i = 0; i < pagevec_count(&pvec); ++i) {
381 			struct page *page = pvec.pages[i];
382 			u32 hash;
383 
384 			/*
385 			 * The page (index) could be beyond end.  This is
386 			 * only possible in the punch hole case as end is
387 			 * max page offset in the truncate case.
388 			 */
389 			next = page->index;
390 			if (next >= end)
391 				break;
392 
393 			hash = hugetlb_fault_mutex_hash(h, current->mm,
394 							&pseudo_vma,
395 							mapping, next, 0);
396 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
397 
398 			lock_page(page);
399 			if (likely(!page_mapped(page))) {
400 				bool rsv_on_error = !PagePrivate(page);
401 				/*
402 				 * We must free the huge page and remove
403 				 * from page cache (remove_huge_page) BEFORE
404 				 * removing the region/reserve map
405 				 * (hugetlb_unreserve_pages).  In rare out
406 				 * of memory conditions, removal of the
407 				 * region/reserve map could fail.  Before
408 				 * free'ing the page, note PagePrivate which
409 				 * is used in case of error.
410 				 */
411 				remove_huge_page(page);
412 				freed++;
413 				if (!truncate_op) {
414 					if (unlikely(hugetlb_unreserve_pages(
415 							inode, next,
416 							next + 1, 1)))
417 						hugetlb_fix_reserve_counts(
418 							inode, rsv_on_error);
419 				}
420 			} else {
421 				/*
422 				 * If page is mapped, it was faulted in after
423 				 * being unmapped.  It indicates a race between
424 				 * hole punch and page fault.  Do nothing in
425 				 * this case.  Getting here in a truncate
426 				 * operation is a bug.
427 				 */
428 				BUG_ON(truncate_op);
429 			}
430 
431 			unlock_page(page);
432 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
433 		}
434 		++next;
435 		huge_pagevec_release(&pvec);
436 		cond_resched();
437 	}
438 
439 	if (truncate_op)
440 		(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
441 }
442 
443 static void hugetlbfs_evict_inode(struct inode *inode)
444 {
445 	struct resv_map *resv_map;
446 
447 	remove_inode_hugepages(inode, 0, LLONG_MAX);
448 	resv_map = (struct resv_map *)inode->i_mapping->private_data;
449 	/* The root inode doesn't have a resv_map, so check it before releasing */
450 	if (resv_map)
451 		resv_map_release(&resv_map->refs);
452 	clear_inode(inode);
453 }
454 
455 static inline void
456 hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
457 {
458 	struct vm_area_struct *vma;
459 
460 	/*
461 	 * end == 0 indicates that the entire range after
462 	 * start should be unmapped.
463 	 */
464 	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
465 		unsigned long v_offset;
466 
467 		/*
468 		 * Can the expression below overflow on 32-bit arches?
469 		 * No, because the interval tree returns us only those vmas
470 		 * which overlap the truncated area starting at pgoff,
471 		 * and no vma on a 32-bit arch can span beyond 4GB.
472 		 */
473 		if (vma->vm_pgoff < start)
474 			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
475 		else
476 			v_offset = 0;
477 
478 		if (end) {
479 			end = ((end - start) << PAGE_SHIFT) +
480 			       vma->vm_start + v_offset;
481 			if (end > vma->vm_end)
482 				end = vma->vm_end;
483 		} else
484 			end = vma->vm_end;
485 
486 		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
487 	}
488 }
489 
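/*
 * Truncate the file down to @offset: update i_size, unmap any mappings
 * beyond the new size and release the pages and reservations there.
 */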
490 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
491 {
492 	pgoff_t pgoff;
493 	struct address_space *mapping = inode->i_mapping;
494 	struct hstate *h = hstate_inode(inode);
495 
496 	BUG_ON(offset & ~huge_page_mask(h));
497 	pgoff = offset >> PAGE_SHIFT;
498 
499 	i_size_write(inode, offset);
500 	i_mmap_lock_write(mapping);
501 	if (!RB_EMPTY_ROOT(&mapping->i_mmap))
502 		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
503 	i_mmap_unlock_write(mapping);
504 	remove_inode_hugepages(inode, offset, LLONG_MAX);
505 	return 0;
506 }
507 
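/*
 * Punch a hole in the file.  Only whole huge pages fully contained in
 * [offset, offset + len) are unmapped and removed.
 */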
508 static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
509 {
510 	struct hstate *h = hstate_inode(inode);
511 	loff_t hpage_size = huge_page_size(h);
512 	loff_t hole_start, hole_end;
513 
514 	/*
515 	 * For hole punch round up the beginning offset of the hole and
516 	 * round down the end.
517 	 */
518 	hole_start = round_up(offset, hpage_size);
519 	hole_end = round_down(offset + len, hpage_size);
520 
521 	if (hole_end > hole_start) {
522 		struct address_space *mapping = inode->i_mapping;
523 
524 		mutex_lock(&inode->i_mutex);
525 		i_mmap_lock_write(mapping);
526 		if (!RB_EMPTY_ROOT(&mapping->i_mmap))
527 			hugetlb_vmdelete_list(&mapping->i_mmap,
528 						hole_start >> PAGE_SHIFT,
529 						hole_end  >> PAGE_SHIFT);
530 		i_mmap_unlock_write(mapping);
531 		remove_inode_hugepages(inode, hole_start, hole_end);
532 		mutex_unlock(&inode->i_mutex);
533 	}
534 
535 	return 0;
536 }
537 
538 static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
539 				loff_t len)
540 {
541 	struct inode *inode = file_inode(file);
542 	struct address_space *mapping = inode->i_mapping;
543 	struct hstate *h = hstate_inode(inode);
544 	struct vm_area_struct pseudo_vma;
545 	struct mm_struct *mm = current->mm;
546 	loff_t hpage_size = huge_page_size(h);
547 	unsigned long hpage_shift = huge_page_shift(h);
548 	pgoff_t start, index, end;
549 	int error;
550 	u32 hash;
551 
552 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
553 		return -EOPNOTSUPP;
554 
555 	if (mode & FALLOC_FL_PUNCH_HOLE)
556 		return hugetlbfs_punch_hole(inode, offset, len);
557 
558 	/*
559 	 * Default preallocate case.
560 	 * For this range, start is rounded down and end is rounded up
561 	 * as well as being converted to page offsets.
562 	 */
563 	start = offset >> hpage_shift;
564 	end = (offset + len + hpage_size - 1) >> hpage_shift;
565 
566 	mutex_lock(&inode->i_mutex);
567 
568 	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
569 	error = inode_newsize_ok(inode, offset + len);
570 	if (error)
571 		goto out;
572 
573 	/*
574 	 * Initialize a pseudo vma as this is required by the huge page
575 	 * allocation routines.  If NUMA is configured, use page index
576 	 * as input to create an allocation policy.
577 	 */
578 	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
579 	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
580 	pseudo_vma.vm_file = file;
581 
582 	for (index = start; index < end; index++) {
583 		/*
584 		 * This is supposed to be the vaddr where the page is being
585 		 * faulted in, but we have no vaddr here.
586 		 */
587 		struct page *page;
588 		unsigned long addr;
589 		int avoid_reserve = 0;
590 
591 		cond_resched();
592 
593 		/*
594 		 * fallocate(2) manpage permits EINTR; we may have been
595 		 * interrupted because we are using up too much memory.
596 		 */
597 		if (signal_pending(current)) {
598 			error = -EINTR;
599 			break;
600 		}
601 
602 		/* Set numa allocation policy based on index */
603 		hugetlb_set_vma_policy(&pseudo_vma, inode, index);
604 
605 		/* addr is the offset within the file (zero based) */
606 		addr = index * hpage_size;
607 
608 		/* mutex taken here to serialize with the fault path and hole punch */
609 		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
610 						index, addr);
611 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
612 
613 		/* See if already present in mapping to avoid alloc/free */
614 		page = find_get_page(mapping, index);
615 		if (page) {
616 			put_page(page);
617 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
618 			hugetlb_drop_vma_policy(&pseudo_vma);
619 			continue;
620 		}
621 
622 		/* Allocate page and add to page cache */
623 		page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
624 		hugetlb_drop_vma_policy(&pseudo_vma);
625 		if (IS_ERR(page)) {
626 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
627 			error = PTR_ERR(page);
628 			goto out;
629 		}
630 		clear_huge_page(page, addr, pages_per_huge_page(h));
631 		__SetPageUptodate(page);
632 		error = huge_add_to_page_cache(page, mapping, index);
633 		if (unlikely(error)) {
634 			put_page(page);
635 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
636 			goto out;
637 		}
638 
639 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
640 
641 		/*
642 		 * put_page due to reference from alloc_huge_page()
643 		 * unlock_page because locked by add_to_page_cache()
644 		 */
645 		put_page(page);
646 		unlock_page(page);
647 	}
648 
649 	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
650 		i_size_write(inode, offset + len);
651 	inode->i_ctime = CURRENT_TIME;
652 out:
653 	mutex_unlock(&inode->i_mutex);
654 	return error;
655 }
656 
657 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
658 {
659 	struct inode *inode = d_inode(dentry);
660 	struct hstate *h = hstate_inode(inode);
661 	int error;
662 	unsigned int ia_valid = attr->ia_valid;
663 
664 	BUG_ON(!inode);
665 
666 	error = inode_change_ok(inode, attr);
667 	if (error)
668 		return error;
669 
670 	if (ia_valid & ATTR_SIZE) {
671 		error = -EINVAL;
672 		if (attr->ia_size & ~huge_page_mask(h))
673 			return -EINVAL;
674 		error = hugetlb_vmtruncate(inode, attr->ia_size);
675 		if (error)
676 			return error;
677 	}
678 
679 	setattr_copy(inode, attr);
680 	mark_inode_dirty(inode);
681 	return 0;
682 }
683 
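/* Allocate and initialize the root directory inode at mount time. */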
684 static struct inode *hugetlbfs_get_root(struct super_block *sb,
685 					struct hugetlbfs_config *config)
686 {
687 	struct inode *inode;
688 
689 	inode = new_inode(sb);
690 	if (inode) {
691 		struct hugetlbfs_inode_info *info;
692 		inode->i_ino = get_next_ino();
693 		inode->i_mode = S_IFDIR | config->mode;
694 		inode->i_uid = config->uid;
695 		inode->i_gid = config->gid;
696 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
697 		info = HUGETLBFS_I(inode);
698 		mpol_shared_policy_init(&info->policy, NULL);
699 		inode->i_op = &hugetlbfs_dir_inode_operations;
700 		inode->i_fop = &simple_dir_operations;
701 		/* directory inodes start off with i_nlink == 2 (for "." entry) */
702 		inc_nlink(inode);
703 		lockdep_annotate_inode_mutex_key(inode);
704 	}
705 	return inode;
706 }
707 
708 /*
709  * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
710  * be taken from reclaim -- unlike regular filesystems. This needs an
711  * annotation because huge_pmd_share() does an allocation under
712  * i_mmap_rwsem.
713  */
714 static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
715 
716 static struct inode *hugetlbfs_get_inode(struct super_block *sb,
717 					struct inode *dir,
718 					umode_t mode, dev_t dev)
719 {
720 	struct inode *inode;
721 	struct resv_map *resv_map;
722 
723 	resv_map = resv_map_alloc();
724 	if (!resv_map)
725 		return NULL;
726 
727 	inode = new_inode(sb);
728 	if (inode) {
729 		struct hugetlbfs_inode_info *info;
730 		inode->i_ino = get_next_ino();
731 		inode_init_owner(inode, dir, mode);
732 		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
733 				&hugetlbfs_i_mmap_rwsem_key);
734 		inode->i_mapping->a_ops = &hugetlbfs_aops;
735 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
736 		inode->i_mapping->private_data = resv_map;
737 		info = HUGETLBFS_I(inode);
738 		/*
739 		 * The policy is initialized here even if we are creating a
740 		 * private inode because initialization simply creates an
741 		 * empty rb tree and calls spin_lock_init().  Later, when we
742 		 * call mpol_free_shared_policy(), it will just return because
743 		 * the rb tree will still be empty.
744 		 */
745 		mpol_shared_policy_init(&info->policy, NULL);
746 		switch (mode & S_IFMT) {
747 		default:
748 			init_special_inode(inode, mode, dev);
749 			break;
750 		case S_IFREG:
751 			inode->i_op = &hugetlbfs_inode_operations;
752 			inode->i_fop = &hugetlbfs_file_operations;
753 			break;
754 		case S_IFDIR:
755 			inode->i_op = &hugetlbfs_dir_inode_operations;
756 			inode->i_fop = &simple_dir_operations;
757 
758 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
759 			inc_nlink(inode);
760 			break;
761 		case S_IFLNK:
762 			inode->i_op = &page_symlink_inode_operations;
763 			break;
764 		}
765 		lockdep_annotate_inode_mutex_key(inode);
766 	} else
767 		kref_put(&resv_map->refs, resv_map_release);
768 
769 	return inode;
770 }
771 
772 /*
773  * File creation. Allocate an inode, and we're done..
774  */
775 static int hugetlbfs_mknod(struct inode *dir,
776 			struct dentry *dentry, umode_t mode, dev_t dev)
777 {
778 	struct inode *inode;
779 	int error = -ENOSPC;
780 
781 	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
782 	if (inode) {
783 		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
784 		d_instantiate(dentry, inode);
785 		dget(dentry);	/* Extra count - pin the dentry in core */
786 		error = 0;
787 	}
788 	return error;
789 }
790 
791 static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
792 {
793 	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
794 	if (!retval)
795 		inc_nlink(dir);
796 	return retval;
797 }
798 
799 static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
800 {
801 	return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
802 }
803 
804 static int hugetlbfs_symlink(struct inode *dir,
805 			struct dentry *dentry, const char *symname)
806 {
807 	struct inode *inode;
808 	int error = -ENOSPC;
809 
810 	inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
811 	if (inode) {
812 		int l = strlen(symname)+1;
813 		error = page_symlink(inode, symname, l);
814 		if (!error) {
815 			d_instantiate(dentry, inode);
816 			dget(dentry);
817 		} else
818 			iput(inode);
819 	}
820 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
821 
822 	return error;
823 }
824 
825 /*
826  * mark the head page dirty
827  */
828 static int hugetlbfs_set_page_dirty(struct page *page)
829 {
830 	struct page *head = compound_head(page);
831 
832 	SetPageDirty(head);
833 	return 0;
834 }
835 
836 static int hugetlbfs_migrate_page(struct address_space *mapping,
837 				struct page *newpage, struct page *page,
838 				enum migrate_mode mode)
839 {
840 	int rc;
841 
842 	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
843 	if (rc != MIGRATEPAGE_SUCCESS)
844 		return rc;
845 	migrate_page_copy(newpage, page);
846 
847 	return MIGRATEPAGE_SUCCESS;
848 }
849 
850 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
851 {
852 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
853 	struct hstate *h = hstate_inode(d_inode(dentry));
854 
855 	buf->f_type = HUGETLBFS_MAGIC;
856 	buf->f_bsize = huge_page_size(h);
857 	if (sbinfo) {
858 		spin_lock(&sbinfo->stat_lock);
859 		/* If no limits set, just report 0 for max/free/used
860 		 * blocks, like simple_statfs() */
861 		if (sbinfo->spool) {
862 			long free_pages;
863 
864 			spin_lock(&sbinfo->spool->lock);
865 			buf->f_blocks = sbinfo->spool->max_hpages;
866 			free_pages = sbinfo->spool->max_hpages
867 				- sbinfo->spool->used_hpages;
868 			buf->f_bavail = buf->f_bfree = free_pages;
869 			spin_unlock(&sbinfo->spool->lock);
870 			buf->f_files = sbinfo->max_inodes;
871 			buf->f_ffree = sbinfo->free_inodes;
872 		}
873 		spin_unlock(&sbinfo->stat_lock);
874 	}
875 	buf->f_namelen = NAME_MAX;
876 	return 0;
877 }
878 
879 static void hugetlbfs_put_super(struct super_block *sb)
880 {
881 	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);
882 
883 	if (sbi) {
884 		sb->s_fs_info = NULL;
885 
886 		if (sbi->spool)
887 			hugepage_put_subpool(sbi->spool);
888 
889 		kfree(sbi);
890 	}
891 }
892 
893 static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
894 {
895 	if (sbinfo->free_inodes >= 0) {
896 		spin_lock(&sbinfo->stat_lock);
897 		if (unlikely(!sbinfo->free_inodes)) {
898 			spin_unlock(&sbinfo->stat_lock);
899 			return 0;
900 		}
901 		sbinfo->free_inodes--;
902 		spin_unlock(&sbinfo->stat_lock);
903 	}
904 
905 	return 1;
906 }
907 
908 static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
909 {
910 	if (sbinfo->free_inodes >= 0) {
911 		spin_lock(&sbinfo->stat_lock);
912 		sbinfo->free_inodes++;
913 		spin_unlock(&sbinfo->stat_lock);
914 	}
915 }
916 
917 
918 static struct kmem_cache *hugetlbfs_inode_cachep;
919 
920 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
921 {
922 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
923 	struct hugetlbfs_inode_info *p;
924 
925 	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
926 		return NULL;
927 	p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
928 	if (unlikely(!p)) {
929 		hugetlbfs_inc_free_inodes(sbinfo);
930 		return NULL;
931 	}
932 	return &p->vfs_inode;
933 }
934 
935 static void hugetlbfs_i_callback(struct rcu_head *head)
936 {
937 	struct inode *inode = container_of(head, struct inode, i_rcu);
938 	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
939 }
940 
941 static void hugetlbfs_destroy_inode(struct inode *inode)
942 {
943 	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
944 	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
945 	call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
946 }
947 
948 static const struct address_space_operations hugetlbfs_aops = {
949 	.write_begin	= hugetlbfs_write_begin,
950 	.write_end	= hugetlbfs_write_end,
951 	.set_page_dirty	= hugetlbfs_set_page_dirty,
952 	.migratepage    = hugetlbfs_migrate_page,
953 };
954 
955 
956 static void init_once(void *foo)
957 {
958 	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
959 
960 	inode_init_once(&ei->vfs_inode);
961 }
962 
963 const struct file_operations hugetlbfs_file_operations = {
964 	.read_iter		= hugetlbfs_read_iter,
965 	.mmap			= hugetlbfs_file_mmap,
966 	.fsync			= noop_fsync,
967 	.get_unmapped_area	= hugetlb_get_unmapped_area,
968 	.llseek			= default_llseek,
969 	.fallocate		= hugetlbfs_fallocate,
970 };
971 
972 static const struct inode_operations hugetlbfs_dir_inode_operations = {
973 	.create		= hugetlbfs_create,
974 	.lookup		= simple_lookup,
975 	.link		= simple_link,
976 	.unlink		= simple_unlink,
977 	.symlink	= hugetlbfs_symlink,
978 	.mkdir		= hugetlbfs_mkdir,
979 	.rmdir		= simple_rmdir,
980 	.mknod		= hugetlbfs_mknod,
981 	.rename		= simple_rename,
982 	.setattr	= hugetlbfs_setattr,
983 };
984 
985 static const struct inode_operations hugetlbfs_inode_operations = {
986 	.setattr	= hugetlbfs_setattr,
987 };
988 
989 static const struct super_operations hugetlbfs_ops = {
990 	.alloc_inode    = hugetlbfs_alloc_inode,
991 	.destroy_inode  = hugetlbfs_destroy_inode,
992 	.evict_inode	= hugetlbfs_evict_inode,
993 	.statfs		= hugetlbfs_statfs,
994 	.put_super	= hugetlbfs_put_super,
995 	.show_options	= generic_show_options,
996 };
997 
998 enum { NO_SIZE, SIZE_STD, SIZE_PERCENT };
999 
1000 /*
1001  * Convert size option passed from command line to number of huge pages
1002  * in the pool specified by hstate.  Size option could be in bytes
1003  * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
1004  */
1005 static long long
1006 hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
1007 								int val_type)
1008 {
1009 	if (val_type == NO_SIZE)
1010 		return -1;
1011 
1012 	if (val_type == SIZE_PERCENT) {
1013 		size_opt <<= huge_page_shift(h);
1014 		size_opt *= h->max_huge_pages;
1015 		do_div(size_opt, 100);
1016 	}
1017 
1018 	size_opt >>= huge_page_shift(h);
1019 	return size_opt;
1020 }
1021 
1022 static int
1023 hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
1024 {
1025 	char *p, *rest;
1026 	substring_t args[MAX_OPT_ARGS];
1027 	int option;
1028 	unsigned long long max_size_opt = 0, min_size_opt = 0;
1029 	int max_val_type = NO_SIZE, min_val_type = NO_SIZE;
1030 
1031 	if (!options)
1032 		return 0;
1033 
1034 	while ((p = strsep(&options, ",")) != NULL) {
1035 		int token;
1036 		if (!*p)
1037 			continue;
1038 
1039 		token = match_token(p, tokens, args);
1040 		switch (token) {
1041 		case Opt_uid:
1042 			if (match_int(&args[0], &option))
1043 				goto bad_val;
1044 			pconfig->uid = make_kuid(current_user_ns(), option);
1045 			if (!uid_valid(pconfig->uid))
1046 				goto bad_val;
1047 			break;
1048 
1049 		case Opt_gid:
1050 			if (match_int(&args[0], &option))
1051 				goto bad_val;
1052 			pconfig->gid = make_kgid(current_user_ns(), option);
1053 			if (!gid_valid(pconfig->gid))
1054 				goto bad_val;
1055 			break;
1056 
1057 		case Opt_mode:
1058 			if (match_octal(&args[0], &option))
1059 				goto bad_val;
1060 			pconfig->mode = option & 01777U;
1061 			break;
1062 
1063 		case Opt_size: {
1064 			/* memparse() will accept a K/M/G without a digit */
1065 			if (!isdigit(*args[0].from))
1066 				goto bad_val;
1067 			max_size_opt = memparse(args[0].from, &rest);
1068 			max_val_type = SIZE_STD;
1069 			if (*rest == '%')
1070 				max_val_type = SIZE_PERCENT;
1071 			break;
1072 		}
1073 
1074 		case Opt_nr_inodes:
1075 			/* memparse() will accept a K/M/G without a digit */
1076 			if (!isdigit(*args[0].from))
1077 				goto bad_val;
1078 			pconfig->nr_inodes = memparse(args[0].from, &rest);
1079 			break;
1080 
1081 		case Opt_pagesize: {
1082 			unsigned long ps;
1083 			ps = memparse(args[0].from, &rest);
1084 			pconfig->hstate = size_to_hstate(ps);
1085 			if (!pconfig->hstate) {
1086 				pr_err("Unsupported page size %lu MB\n",
1087 					ps >> 20);
1088 				return -EINVAL;
1089 			}
1090 			break;
1091 		}
1092 
1093 		case Opt_min_size: {
1094 			/* memparse() will accept a K/M/G without a digit */
1095 			if (!isdigit(*args[0].from))
1096 				goto bad_val;
1097 			min_size_opt = memparse(args[0].from, &rest);
1098 			min_val_type = SIZE_STD;
1099 			if (*rest == '%')
1100 				min_val_type = SIZE_PERCENT;
1101 			break;
1102 		}
1103 
1104 		default:
1105 			pr_err("Bad mount option: \"%s\"\n", p);
1106 			return -EINVAL;
1107 			break;
1108 		}
1109 	}
1110 
1111 	/*
1112 	 * Use huge page pool size (in hstate) to convert the size
1113 	 * options to number of huge pages.  If NO_SIZE, -1 is returned.
1114 	 */
1115 	pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
1116 						max_size_opt, max_val_type);
1117 	pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
1118 						min_size_opt, min_val_type);
1119 
1120 	/*
1121 	 * If max_size was specified, then min_size must be smaller
1122 	 */
1123 	if (max_val_type > NO_SIZE &&
1124 	    pconfig->min_hpages > pconfig->max_hpages) {
1125 		pr_err("minimum size can not be greater than maximum size\n");
1126 		return -EINVAL;
1127 	}
1128 
1129 	return 0;
1130 
1131 bad_val:
1132 	pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
1133 	return -EINVAL;
1134 }
1135 
1136 static int
1137 hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
1138 {
1139 	int ret;
1140 	struct hugetlbfs_config config;
1141 	struct hugetlbfs_sb_info *sbinfo;
1142 
1143 	save_mount_options(sb, data);
1144 
1145 	config.max_hpages = -1; /* No limit on size by default */
1146 	config.nr_inodes = -1; /* No limit on number of inodes by default */
1147 	config.uid = current_fsuid();
1148 	config.gid = current_fsgid();
1149 	config.mode = 0755;
1150 	config.hstate = &default_hstate;
1151 	config.min_hpages = -1; /* No default minimum size */
1152 	ret = hugetlbfs_parse_options(data, &config);
1153 	if (ret)
1154 		return ret;
1155 
1156 	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
1157 	if (!sbinfo)
1158 		return -ENOMEM;
1159 	sb->s_fs_info = sbinfo;
1160 	sbinfo->hstate = config.hstate;
1161 	spin_lock_init(&sbinfo->stat_lock);
1162 	sbinfo->max_inodes = config.nr_inodes;
1163 	sbinfo->free_inodes = config.nr_inodes;
1164 	sbinfo->spool = NULL;
1165 	/*
1166 	 * Allocate and initialize subpool if maximum or minimum size is
1167 	 * specified.  Any needed reservations (for minimum size) are
1168 	 * taken when the subpool is created.
1169 	 */
1170 	if (config.max_hpages != -1 || config.min_hpages != -1) {
1171 		sbinfo->spool = hugepage_new_subpool(config.hstate,
1172 							config.max_hpages,
1173 							config.min_hpages);
1174 		if (!sbinfo->spool)
1175 			goto out_free;
1176 	}
1177 	sb->s_maxbytes = MAX_LFS_FILESIZE;
1178 	sb->s_blocksize = huge_page_size(config.hstate);
1179 	sb->s_blocksize_bits = huge_page_shift(config.hstate);
1180 	sb->s_magic = HUGETLBFS_MAGIC;
1181 	sb->s_op = &hugetlbfs_ops;
1182 	sb->s_time_gran = 1;
1183 	sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
1184 	if (!sb->s_root)
1185 		goto out_free;
1186 	return 0;
1187 out_free:
1188 	kfree(sbinfo->spool);
1189 	kfree(sbinfo);
1190 	return -ENOMEM;
1191 }
1192 
1193 static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
1194 	int flags, const char *dev_name, void *data)
1195 {
1196 	return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
1197 }
1198 
1199 static struct file_system_type hugetlbfs_fs_type = {
1200 	.name		= "hugetlbfs",
1201 	.mount		= hugetlbfs_mount,
1202 	.kill_sb	= kill_litter_super,
1203 };
1204 MODULE_ALIAS_FS("hugetlbfs");
1205 
1206 static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
1207 
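/*
 * SHM_HUGETLB segments may be created by tasks with CAP_IPC_LOCK or by
 * members of the group set in the hugetlb_shm_group sysctl.
 */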
1208 static int can_do_hugetlb_shm(void)
1209 {
1210 	kgid_t shm_group;
1211 	shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
1212 	return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
1213 }
1214 
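/*
 * Convert a log2 page size (0 selects the default huge page size) to an
 * index into the hstates[] array, or -1 if that size is unsupported.
 */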
1215 static int get_hstate_idx(int page_size_log)
1216 {
1217 	struct hstate *h = hstate_sizelog(page_size_log);
1218 
1219 	if (!h)
1220 		return -1;
1221 	return h - hstates;
1222 }
1223 
1224 static const struct dentry_operations anon_ops = {
1225 	.d_dname = simple_dname
1226 };
1227 
1228 /*
1229  * Note that size should be aligned to the proper hugepage size on the caller
1230  * side; otherwise hugetlb_reserve_pages reserves one fewer hugepage than intended.
1231  */
1232 struct file *hugetlb_file_setup(const char *name, size_t size,
1233 				vm_flags_t acctflag, struct user_struct **user,
1234 				int creat_flags, int page_size_log)
1235 {
1236 	struct file *file = ERR_PTR(-ENOMEM);
1237 	struct inode *inode;
1238 	struct path path;
1239 	struct super_block *sb;
1240 	struct qstr quick_string;
1241 	int hstate_idx;
1242 
1243 	hstate_idx = get_hstate_idx(page_size_log);
1244 	if (hstate_idx < 0)
1245 		return ERR_PTR(-ENODEV);
1246 
1247 	*user = NULL;
1248 	if (!hugetlbfs_vfsmount[hstate_idx])
1249 		return ERR_PTR(-ENOENT);
1250 
1251 	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
1252 		*user = current_user();
1253 		if (user_shm_lock(size, *user)) {
1254 			task_lock(current);
1255 			pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
1256 				current->comm, current->pid);
1257 			task_unlock(current);
1258 		} else {
1259 			*user = NULL;
1260 			return ERR_PTR(-EPERM);
1261 		}
1262 	}
1263 
1264 	sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;
1265 	quick_string.name = name;
1266 	quick_string.len = strlen(quick_string.name);
1267 	quick_string.hash = 0;
1268 	path.dentry = d_alloc_pseudo(sb, &quick_string);
1269 	if (!path.dentry)
1270 		goto out_shm_unlock;
1271 
1272 	d_set_d_op(path.dentry, &anon_ops);
1273 	path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
1274 	file = ERR_PTR(-ENOSPC);
1275 	inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
1276 	if (!inode)
1277 		goto out_dentry;
1278 	if (creat_flags == HUGETLB_SHMFS_INODE)
1279 		inode->i_flags |= S_PRIVATE;
1280 
1281 	file = ERR_PTR(-ENOMEM);
1282 	if (hugetlb_reserve_pages(inode, 0,
1283 			size >> huge_page_shift(hstate_inode(inode)), NULL,
1284 			acctflag))
1285 		goto out_inode;
1286 
1287 	d_instantiate(path.dentry, inode);
1288 	inode->i_size = size;
1289 	clear_nlink(inode);
1290 
1291 	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
1292 			&hugetlbfs_file_operations);
1293 	if (IS_ERR(file))
1294 		goto out_dentry; /* inode is already attached */
1295 
1296 	return file;
1297 
1298 out_inode:
1299 	iput(inode);
1300 out_dentry:
1301 	path_put(&path);
1302 out_shm_unlock:
1303 	if (*user) {
1304 		user_shm_unlock(size, *user);
1305 		*user = NULL;
1306 	}
1307 	return file;
1308 }
1309 
1310 static int __init init_hugetlbfs_fs(void)
1311 {
1312 	struct hstate *h;
1313 	int error;
1314 	int i;
1315 
1316 	if (!hugepages_supported()) {
1317 		pr_info("disabling because there are no supported hugepage sizes\n");
1318 		return -ENOTSUPP;
1319 	}
1320 
1321 	error = -ENOMEM;
1322 	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
1323 					sizeof(struct hugetlbfs_inode_info),
1324 					0, 0, init_once);
1325 	if (hugetlbfs_inode_cachep == NULL)
1326 		goto out2;
1327 
1328 	error = register_filesystem(&hugetlbfs_fs_type);
1329 	if (error)
1330 		goto out;
1331 
1332 	i = 0;
1333 	for_each_hstate(h) {
1334 		char buf[50];
1335 		unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
1336 
1337 		snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
1338 		hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
1339 							buf);
1340 
1341 		if (IS_ERR(hugetlbfs_vfsmount[i])) {
1342 			pr_err("Cannot mount internal hugetlbfs for "
1343 				"page size %uK", ps_kb);
1344 			error = PTR_ERR(hugetlbfs_vfsmount[i]);
1345 			hugetlbfs_vfsmount[i] = NULL;
1346 		}
1347 		i++;
1348 	}
1349 	/* Non-default hstates are optional */
1350 	if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
1351 		return 0;
1352 
1353  out:
1354 	kmem_cache_destroy(hugetlbfs_inode_cachep);
1355  out2:
1356 	return error;
1357 }
1358 
1359 static void __exit exit_hugetlbfs_fs(void)
1360 {
1361 	struct hstate *h;
1362 	int i;
1363 
1364 
1365 	/*
1366 	 * Make sure all delayed rcu free inodes are flushed before we
1367 	 * destroy cache.
1368 	 */
1369 	rcu_barrier();
1370 	kmem_cache_destroy(hugetlbfs_inode_cachep);
1371 	i = 0;
1372 	for_each_hstate(h)
1373 		kern_unmount(hugetlbfs_vfsmount[i++]);
1374 	unregister_filesystem(&hugetlbfs_fs_type);
1375 }
1376 
1377 module_init(init_hugetlbfs_fs)
1378 module_exit(exit_hugetlbfs_fs)
1379 
1380 MODULE_LICENSE("GPL");
1381