xref: /openbmc/linux/fs/hugetlbfs/inode.c (revision 87c2ce3b)
/*
 * hugetlbpage-backed filesystem.  Based on ramfs.
 *
 * William Irwin, 2002
 *
 * Copyright (C) 2002 Linus Torvalds.
 */

#include <linux/module.h>
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h>		/* remove ASAP */
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>

#include <asm/uaccess.h>

/* some random number */
#define HUGETLBFS_MAGIC	0x958458f6

static struct super_operations hugetlbfs_ops;
static struct address_space_operations hugetlbfs_aops;
struct file_operations hugetlbfs_file_operations;
static struct inode_operations hugetlbfs_dir_inode_operations;
static struct inode_operations hugetlbfs_inode_operations;

static struct backing_dev_info hugetlbfs_backing_dev_info = {
	.ra_pages	= 0,	/* No readahead */
	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
};

int sysctl_hugetlb_shm_group;

static void huge_pagevec_release(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); ++i)
		put_page(pvec->pages[i]);

	pagevec_reinit(pvec);
}

/*
 * huge_pages_needed tries to determine the number of new huge pages that
 * will be required to fully populate this VMA.  This will be equal to
 * the size of the VMA in huge pages minus the number of huge pages
 * (covered by this VMA) that are found in the page cache.
 *
 * Result is in bytes to be compatible with is_hugepage_mem_enough()
 */
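/*
 * Worked example for huge_pages_needed(), with illustrative values
 * (these assume 2 MB huge pages on a 4 KB base-page system, i.e.
 * HPAGE_SHIFT == 21 and PAGE_SHIFT == 12; both are arch-dependent):
 *
 *	vm_end - vm_start = 4 MB   ->  hugepages = 4 MB >> 21 = 2
 *	pagevec_lookup() finds one covered huge page already in the
 *	page cache                 ->  hugepages = 1
 *	return value               =   1 << 21 = 2 MB
 */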
static unsigned long
huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
{
	int i;
	struct pagevec pvec;
	unsigned long start = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
	pgoff_t next = vma->vm_pgoff;
	pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT);

	pagevec_init(&pvec, 0);
	while (next < endpg) {
		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
			break;
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			if (page->index > next)
				next = page->index;
			if (page->index >= endpg)
				break;
			next++;
			hugepages--;
		}
		huge_pagevec_release(&pvec);
	}
	return hugepages << HPAGE_SHIFT;
}

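/*
 * A note on the alignment checks in hugetlbfs_file_mmap() below
 * (example values again assume 2 MB huge pages and 4 KB base pages):
 *
 *	vm_pgoff is in PAGE_SIZE units, so it must be a multiple of
 *	HPAGE_SIZE / PAGE_SIZE = 512 for the file offset to land on a
 *	huge page boundary;
 *	vm_start and vm_end must be 2 MB aligned (addr & ~HPAGE_MASK
 *	tests the low 21 bits);
 *	and the mapping must cover at least one whole huge page.
 */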
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	unsigned long bytes;
	loff_t len, vma_len;
	int ret;

	if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
		return -EINVAL;

	if (vma->vm_start & ~HPAGE_MASK)
		return -EINVAL;

	if (vma->vm_end & ~HPAGE_MASK)
		return -EINVAL;

	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
		return -EINVAL;

	bytes = huge_pages_needed(mapping, vma);
	if (!is_hugepage_mem_enough(bytes))
		return -ENOMEM;

	vma_len = (loff_t)(vma->vm_end - vma->vm_start);

	mutex_lock(&inode->i_mutex);
	file_accessed(file);
	vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
	vma->vm_ops = &hugetlb_vm_ops;

	ret = -ENOMEM;
	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
		goto out;

	ret = 0;
	hugetlb_prefault_arch_hook(vma->vm_mm);
	if (inode->i_size < len)
		inode->i_size = len;
out:
	mutex_unlock(&inode->i_mutex);

	return ret;
}

/*
 * Called under down_write(mmap_sem).
 */

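/*
 * The generic fallback below is a first-fit search: start from the
 * cached free_area_cache hint (or from TASK_UNMAPPED_BASE when the
 * cached hole is known to be too small), round each candidate address
 * up to a huge page boundary, and walk the VMA list until a gap of at
 * least len bytes is found.  If the walk runs past TASK_SIZE, the
 * search restarts once from TASK_UNMAPPED_BASE before returning
 * -ENOMEM, in case a hole below the starting hint was skipped.
 */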
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags);
#else
static unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long start_addr;

	if (len & ~HPAGE_MASK)
		return -EINVAL;
	if (len > TASK_SIZE)
		return -ENOMEM;

	if (addr) {
		addr = ALIGN(addr, HPAGE_SIZE);
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}

	start_addr = mm->free_area_cache;

	if (len <= mm->cached_hole_size)
		start_addr = TASK_UNMAPPED_BASE;

full_search:
	addr = ALIGN(start_addr, HPAGE_SIZE);

	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
		/* At this point:  (!vma || addr < vma->vm_end). */
		if (TASK_SIZE - len < addr) {
			/*
			 * Start a new search - just in case we missed
			 * some holes.
			 */
			if (start_addr != TASK_UNMAPPED_BASE) {
				start_addr = TASK_UNMAPPED_BASE;
				goto full_search;
			}
			return -ENOMEM;
		}

		if (!vma || addr + len <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
	}
}
#endif

/*
 * Read a page.  The comment inherited from ramfs promised a trivial,
 * zero-filling read, but hugetlbfs does not support I/O through the
 * page cache: the page is simply unlocked and the request rejected.
 */
static int hugetlbfs_readpage(struct file *file, struct page *page)
{
	unlock_page(page);
	return -EINVAL;
}

static int hugetlbfs_prepare_write(struct file *file,
			struct page *page, unsigned offset, unsigned to)
{
	return -EINVAL;
}

static int hugetlbfs_commit_write(struct file *file,
			struct page *page, unsigned offset, unsigned to)
{
	return -EINVAL;
}

static void truncate_huge_page(struct page *page)
{
	clear_page_dirty(page);
	ClearPageUptodate(page);
	remove_from_page_cache(page);
	put_page(page);
}

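/*
 * A note on the lookup loop in truncate_hugepages(): when a
 * pagevec_lookup() starting past `start' comes up empty, the scan
 * resets to `start' and tries again; it only terminates once a scan
 * beginning at `start' itself finds nothing, so pages that appear
 * behind the cursor during the walk are not missed.  The BUG_ON at
 * the end asserts that a full truncation (lstart == 0) really did
 * empty the mapping.
 */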
static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
{
	const pgoff_t start = lstart >> HPAGE_SHIFT;
	struct pagevec pvec;
	pgoff_t next;
	int i;

	pagevec_init(&pvec, 0);
	next = start;
	while (1) {
		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
			if (next == start)
				break;
			next = start;
			continue;
		}

		for (i = 0; i < pagevec_count(&pvec); ++i) {
			struct page *page = pvec.pages[i];

			lock_page(page);
			if (page->index > next)
				next = page->index;
			++next;
			truncate_huge_page(page);
			unlock_page(page);
			hugetlb_put_quota(mapping);
		}
		huge_pagevec_release(&pvec);
	}
	BUG_ON(!lstart && mapping->nrpages);
}

static void hugetlbfs_delete_inode(struct inode *inode)
{
	if (inode->i_data.nrpages)
		truncate_hugepages(&inode->i_data, 0);
	clear_inode(inode);
}

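/*
 * hugetlbfs_forget_inode() is an open-coded variant of the VFS
 * generic_forget_inode() path.  It is duplicated here so that the
 * final teardown calls truncate_hugepages() rather than the usual
 * truncate_inode_pages(), which cannot handle huge pages.
 */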
static void hugetlbfs_forget_inode(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	if (!hlist_unhashed(&inode->i_hash)) {
		if (!(inode->i_state & (I_DIRTY|I_LOCK)))
			list_move(&inode->i_list, &inode_unused);
		inodes_stat.nr_unused++;
		if (!sb || (sb->s_flags & MS_ACTIVE)) {
			spin_unlock(&inode_lock);
			return;
		}
		inode->i_state |= I_WILL_FREE;
		spin_unlock(&inode_lock);
		/*
		 * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
		 * in our backing_dev_info.
		 */
		write_inode_now(inode, 1);
		spin_lock(&inode_lock);
		inode->i_state &= ~I_WILL_FREE;
		inodes_stat.nr_unused--;
		hlist_del_init(&inode->i_hash);
	}
	list_del_init(&inode->i_list);
	list_del_init(&inode->i_sb_list);
	inode->i_state |= I_FREEING;
	inodes_stat.nr_inodes--;
	spin_unlock(&inode_lock);
	if (inode->i_data.nrpages)
		truncate_hugepages(&inode->i_data, 0);
	clear_inode(inode);
	destroy_inode(inode);
}

static void hugetlbfs_drop_inode(struct inode *inode)
{
	if (!inode->i_nlink)
		generic_delete_inode(inode);
	else
		hugetlbfs_forget_inode(inode);
}

/*
 * h_pgoff is in HPAGE_SIZE units.
 * vma->vm_pgoff is in PAGE_SIZE units.
 */
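/*
 * Unit-conversion example (illustrative, again assuming 4 KB base
 * pages and 2 MB huge pages, so HPAGE_SHIFT - PAGE_SHIFT == 9):
 *
 *	vma->vm_pgoff = 1024 (4 KB units) ->  h_vm_pgoff = 1024 >> 9 = 2
 *	truncating at h_pgoff = 3         ->  v_offset = (3 - 2) << 21
 *	                                      = 2 MB into the VMA
 */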
static inline void
hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
{
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;

	vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) {
		unsigned long h_vm_pgoff;
		unsigned long v_offset;

		h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
		v_offset = (h_pgoff - h_vm_pgoff) << HPAGE_SHIFT;
		/*
		 * Is this VMA fully outside the truncation point?
		 */
		if (h_vm_pgoff >= h_pgoff)
			v_offset = 0;

		unmap_hugepage_range(vma,
				vma->vm_start + v_offset, vma->vm_end);
	}
}

/*
 * Expanding truncates are not allowed.
 */
static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
	unsigned long pgoff;
	struct address_space *mapping = inode->i_mapping;

	if (offset > inode->i_size)
		return -EINVAL;

	BUG_ON(offset & ~HPAGE_MASK);
	pgoff = offset >> HPAGE_SHIFT;

	inode->i_size = offset;
	spin_lock(&mapping->i_mmap_lock);
	if (!prio_tree_empty(&mapping->i_mmap))
		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
	spin_unlock(&mapping->i_mmap_lock);
	truncate_hugepages(mapping, offset);
	return 0;
}

static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;
	unsigned int ia_valid = attr->ia_valid;

	BUG_ON(!inode);

	error = inode_change_ok(inode, attr);
	if (error)
		goto out;

	if (ia_valid & ATTR_SIZE) {
		error = -EINVAL;
		if (!(attr->ia_size & ~HPAGE_MASK))
			error = hugetlb_vmtruncate(inode, attr->ia_size);
		if (error)
			goto out;
		attr->ia_valid &= ~ATTR_SIZE;
	}
	error = inode_setattr(inode, attr);
out:
	return error;
}

static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
					gid_t gid, int mode, dev_t dev)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (inode) {
		struct hugetlbfs_inode_info *info;
		inode->i_mode = mode;
		inode->i_uid = uid;
		inode->i_gid = gid;
		inode->i_blksize = HPAGE_SIZE;
		inode->i_blocks = 0;
		inode->i_mapping->a_ops = &hugetlbfs_aops;
		inode->i_mapping->backing_dev_info = &hugetlbfs_backing_dev_info;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		info = HUGETLBFS_I(inode);
		mpol_shared_policy_init(&info->policy);
		switch (mode & S_IFMT) {
		default:
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_op = &hugetlbfs_inode_operations;
			inode->i_fop = &hugetlbfs_file_operations;
			break;
		case S_IFDIR:
			inode->i_op = &hugetlbfs_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;

			/* directory inodes start off with i_nlink == 2 (for "." entry) */
			inode->i_nlink++;
			break;
		case S_IFLNK:
			inode->i_op = &page_symlink_inode_operations;
			break;
		}
	}
	return inode;
}

/*
 * File creation. Allocate an inode, and we're done.
 */
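/*
 * A note on the group handling below: these are the usual BSD setgid
 * directory semantics.  If the parent directory has S_ISGID set, new
 * inodes inherit the directory's group rather than the creator's
 * fsgid, and new subdirectories inherit the S_ISGID bit as well.
 */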
static int hugetlbfs_mknod(struct inode *dir,
			struct dentry *dentry, int mode, dev_t dev)
{
	struct inode *inode;
	int error = -ENOSPC;
	gid_t gid;

	if (dir->i_mode & S_ISGID) {
		gid = dir->i_gid;
		if (S_ISDIR(mode))
			mode |= S_ISGID;
	} else {
		gid = current->fsgid;
	}
	inode = hugetlbfs_get_inode(dir->i_sb, current->fsuid, gid, mode, dev);
	if (inode) {
		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
		d_instantiate(dentry, inode);
		dget(dentry);	/* Extra count - pin the dentry in core */
		error = 0;
	}
	return error;
}

static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
	if (!retval)
		dir->i_nlink++;
	return retval;
}

static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
{
	return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
}

static int hugetlbfs_symlink(struct inode *dir,
			struct dentry *dentry, const char *symname)
{
	struct inode *inode;
	int error = -ENOSPC;
	gid_t gid;

	if (dir->i_mode & S_ISGID)
		gid = dir->i_gid;
	else
		gid = current->fsgid;

	inode = hugetlbfs_get_inode(dir->i_sb, current->fsuid,
					gid, S_IFLNK|S_IRWXUGO, 0);
	if (inode) {
		int l = strlen(symname)+1;
		error = page_symlink(inode, symname, l);
		if (!error) {
			d_instantiate(dentry, inode);
			dget(dentry);
		} else
			iput(inode);
	}
	dir->i_ctime = dir->i_mtime = CURRENT_TIME;

	return error;
}

/*
 * For direct-IO reads into hugetlb pages
 */
static int hugetlbfs_set_page_dirty(struct page *page)
{
	return 0;
}

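/*
 * statfs reports sizes in huge-page units, since f_bsize is set to
 * HPAGE_SIZE below.  For example (illustrative, with 2 MB huge
 * pages): a filesystem mounted with its size limited to 512 blocks
 * shows up in df as 512 * 2 MB = 1 GB.
 */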
static int hugetlbfs_statfs(struct super_block *sb, struct kstatfs *buf)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);

	buf->f_type = HUGETLBFS_MAGIC;
	buf->f_bsize = HPAGE_SIZE;
	if (sbinfo) {
		spin_lock(&sbinfo->stat_lock);
		/* If no limits set, just report 0 for max/free/used
		 * blocks, like simple_statfs() */
		if (sbinfo->max_blocks >= 0) {
			buf->f_blocks = sbinfo->max_blocks;
			buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
			buf->f_files = sbinfo->max_inodes;
			buf->f_ffree = sbinfo->free_inodes;
		}
		spin_unlock(&sbinfo->stat_lock);
	}
	buf->f_namelen = NAME_MAX;
	return 0;
}

static void hugetlbfs_put_super(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);

	if (sbi) {
		sb->s_fs_info = NULL;
		kfree(sbi);
	}
}

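/*
 * free_inodes is -1 when the filesystem was mounted without an
 * nr_inodes= limit; the `>= 0' tests below skip all accounting in
 * that case, so inode allocation is unlimited and always succeeds.
 */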
static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes >= 0) {
		spin_lock(&sbinfo->stat_lock);
		if (unlikely(!sbinfo->free_inodes)) {
			spin_unlock(&sbinfo->stat_lock);
			return 0;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}

	return 1;
}

static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes >= 0) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}


static kmem_cache_t *hugetlbfs_inode_cachep;

static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
	struct hugetlbfs_inode_info *p;

	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
		return NULL;
	p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL);
	if (unlikely(!p)) {
		hugetlbfs_inc_free_inodes(sbinfo);
		return NULL;
	}
	return &p->vfs_inode;
}

static void hugetlbfs_destroy_inode(struct inode *inode)
{
	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}

static struct address_space_operations hugetlbfs_aops = {
	.readpage	= hugetlbfs_readpage,
	.prepare_write	= hugetlbfs_prepare_write,
	.commit_write	= hugetlbfs_commit_write,
	.set_page_dirty	= hugetlbfs_set_page_dirty,
};


static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
{
	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;

	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
	    SLAB_CTOR_CONSTRUCTOR)
		inode_init_once(&ei->vfs_inode);
}

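/*
 * Note that there is no .read or .write here: with only .mmap wired
 * up, read(2) and write(2) on a hugetlbfs file fail (the VFS rejects
 * a file with no read/write methods), so all I/O goes through
 * mappings.
 */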
struct file_operations hugetlbfs_file_operations = {
	.mmap			= hugetlbfs_file_mmap,
	.fsync			= simple_sync_file,
	.get_unmapped_area	= hugetlb_get_unmapped_area,
};

static struct inode_operations hugetlbfs_dir_inode_operations = {
	.create		= hugetlbfs_create,
	.lookup		= simple_lookup,
	.link		= simple_link,
	.unlink		= simple_unlink,
	.symlink	= hugetlbfs_symlink,
	.mkdir		= hugetlbfs_mkdir,
	.rmdir		= simple_rmdir,
	.mknod		= hugetlbfs_mknod,
	.rename		= simple_rename,
	.setattr	= hugetlbfs_setattr,
};

static struct inode_operations hugetlbfs_inode_operations = {
	.setattr	= hugetlbfs_setattr,
};

static struct super_operations hugetlbfs_ops = {
	.alloc_inode    = hugetlbfs_alloc_inode,
	.destroy_inode  = hugetlbfs_destroy_inode,
	.statfs		= hugetlbfs_statfs,
	.delete_inode	= hugetlbfs_delete_inode,
	.drop_inode	= hugetlbfs_drop_inode,
	.put_super	= hugetlbfs_put_super,
};

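/*
 * Mount options accepted below, with an illustrative invocation (the
 * mount point and values are made up for the example):
 *
 *	mount -t hugetlbfs -o uid=1000,gid=1000,mode=0700,size=50%,nr_inodes=16 none /mnt/huge
 *
 * size= accepts either a byte count (rounded down to a whole number
 * of huge pages) or a percentage of the huge page pool, handled by
 * the '%' branch in the parser; nr_inodes= caps the number of
 * inodes, i.e. the number of files that can be created.
 */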
static int
hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
{
	char *opt, *value, *rest;

	if (!options)
		return 0;
	while ((opt = strsep(&options, ",")) != NULL) {
		if (!*opt)
			continue;

		value = strchr(opt, '=');
		if (!value || !*value)
			return -EINVAL;
		else
			*value++ = '\0';

		if (!strcmp(opt, "uid"))
			pconfig->uid = simple_strtoul(value, &value, 0);
		else if (!strcmp(opt, "gid"))
			pconfig->gid = simple_strtoul(value, &value, 0);
		else if (!strcmp(opt, "mode"))
			pconfig->mode = simple_strtoul(value, &value, 0) & 0777U;
		else if (!strcmp(opt, "size")) {
			unsigned long long size = memparse(value, &rest);
			if (*rest == '%') {
				size <<= HPAGE_SHIFT;
				size *= max_huge_pages;
				do_div(size, 100);
				rest++;
			}
			size &= HPAGE_MASK;
			pconfig->nr_blocks = (size >> HPAGE_SHIFT);
			value = rest;
		} else if (!strcmp(opt, "nr_inodes")) {
			pconfig->nr_inodes = memparse(value, &rest);
			value = rest;
		} else
			return -EINVAL;

		if (*value)
			return -EINVAL;
	}
	return 0;
}

static int
hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
{
	struct inode *inode;
	struct dentry *root;
	int ret;
	struct hugetlbfs_config config;
	struct hugetlbfs_sb_info *sbinfo;

	config.nr_blocks = -1; /* No limit on size by default */
	config.nr_inodes = -1; /* No limit on number of inodes by default */
	config.uid = current->fsuid;
	config.gid = current->fsgid;
	config.mode = 0755;
	ret = hugetlbfs_parse_options(data, &config);

	if (ret)
		return ret;

	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
	if (!sbinfo)
		return -ENOMEM;
	sb->s_fs_info = sbinfo;
	spin_lock_init(&sbinfo->stat_lock);
	sbinfo->max_blocks = config.nr_blocks;
	sbinfo->free_blocks = config.nr_blocks;
	sbinfo->max_inodes = config.nr_inodes;
	sbinfo->free_inodes = config.nr_inodes;
	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_blocksize = HPAGE_SIZE;
	sb->s_blocksize_bits = HPAGE_SHIFT;
	sb->s_magic = HUGETLBFS_MAGIC;
	sb->s_op = &hugetlbfs_ops;
	sb->s_time_gran = 1;
	inode = hugetlbfs_get_inode(sb, config.uid, config.gid,
					S_IFDIR | config.mode, 0);
	if (!inode)
		goto out_free;

	root = d_alloc_root(inode);
	if (!root) {
		iput(inode);
		goto out_free;
	}
	sb->s_root = root;
	return 0;
out_free:
	kfree(sbinfo);
	return -ENOMEM;
}

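/*
 * "Quota" here is not disk quota: hugetlb_get_quota() and
 * hugetlb_put_quota() charge and uncharge one huge page against the
 * per-superblock block limit established by the size= mount option.
 * When no limit was set, free_blocks stays at -1 and both calls are
 * no-ops.
 */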
int hugetlb_get_quota(struct address_space *mapping)
{
	int ret = 0;
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);

	if (sbinfo->free_blocks > -1) {
		spin_lock(&sbinfo->stat_lock);
		if (sbinfo->free_blocks > 0)
			sbinfo->free_blocks--;
		else
			ret = -ENOMEM;
		spin_unlock(&sbinfo->stat_lock);
	}

	return ret;
}

void hugetlb_put_quota(struct address_space *mapping)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);

	if (sbinfo->free_blocks > -1) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_blocks++;
		spin_unlock(&sbinfo->stat_lock);
	}
}

static struct super_block *hugetlbfs_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super);
}

static struct file_system_type hugetlbfs_fs_type = {
	.name		= "hugetlbfs",
	.get_sb		= hugetlbfs_get_sb,
	.kill_sb	= kill_litter_super,
};

static struct vfsmount *hugetlbfs_vfsmount;

/*
 * Return the next identifier for a shm file
 */
static unsigned long hugetlbfs_counter(void)
{
	static DEFINE_SPINLOCK(lock);
	static unsigned long counter;
	unsigned long ret;

	spin_lock(&lock);
	ret = ++counter;
	spin_unlock(&lock);
	return ret;
}

static int can_do_hugetlb_shm(void)
{
	return likely(capable(CAP_IPC_LOCK) ||
			in_group_p(sysctl_hugetlb_shm_group) ||
			can_do_mlock());
}

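/*
 * hugetlb_zero_setup() backs an anonymous SysV shared memory segment
 * with an unlinked hugetlbfs file on the internal kernel mount.  The
 * expected caller is the SHM_HUGETLB path of shmget(2) in ipc/shm.c;
 * a userspace sketch of that path (illustrative flags and size):
 *
 *	int id = shmget(IPC_PRIVATE, 4UL << 20,
 *			SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
 *	void *p = shmat(id, NULL, 0);
 */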
struct file *hugetlb_zero_setup(size_t size)
{
	int error = -ENOMEM;
	struct file *file;
	struct inode *inode;
	struct dentry *dentry, *root;
	struct qstr quick_string;
	char buf[16];

	if (!can_do_hugetlb_shm())
		return ERR_PTR(-EPERM);

	if (!is_hugepage_mem_enough(size))
		return ERR_PTR(-ENOMEM);

	if (!user_shm_lock(size, current->user))
		return ERR_PTR(-ENOMEM);

	root = hugetlbfs_vfsmount->mnt_root;
	snprintf(buf, 16, "%lu", hugetlbfs_counter());
	quick_string.name = buf;
	quick_string.len = strlen(quick_string.name);
	quick_string.hash = 0;
	dentry = d_alloc(root, &quick_string);
	if (!dentry)
		goto out_shm_unlock;

	error = -ENFILE;
	file = get_empty_filp();
	if (!file)
		goto out_dentry;

	error = -ENOSPC;
	inode = hugetlbfs_get_inode(root->d_sb, current->fsuid,
				current->fsgid, S_IFREG | S_IRWXUGO, 0);
	if (!inode)
		goto out_file;

	d_instantiate(dentry, inode);
	inode->i_size = size;
	inode->i_nlink = 0;
	file->f_vfsmnt = mntget(hugetlbfs_vfsmount);
	file->f_dentry = dentry;
	file->f_mapping = inode->i_mapping;
	file->f_op = &hugetlbfs_file_operations;
	file->f_mode = FMODE_WRITE | FMODE_READ;
	return file;

out_file:
	put_filp(file);
out_dentry:
	dput(dentry);
out_shm_unlock:
	user_shm_unlock(size, current->user);
	return ERR_PTR(error);
}

static int __init init_hugetlbfs_fs(void)
{
	int error;
	struct vfsmount *vfsmount;

	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
					sizeof(struct hugetlbfs_inode_info),
					0, 0, init_once, NULL);
	if (hugetlbfs_inode_cachep == NULL)
		return -ENOMEM;

	error = register_filesystem(&hugetlbfs_fs_type);
	if (error)
		goto out;

	vfsmount = kern_mount(&hugetlbfs_fs_type);

	if (!IS_ERR(vfsmount)) {
		hugetlbfs_vfsmount = vfsmount;
		return 0;
	}

	error = PTR_ERR(vfsmount);

 out:
	if (error)
		kmem_cache_destroy(hugetlbfs_inode_cachep);
	return error;
}

static void __exit exit_hugetlbfs_fs(void)
{
	kmem_cache_destroy(hugetlbfs_inode_cachep);
	unregister_filesystem(&hugetlbfs_fs_type);
}

module_init(init_hugetlbfs_fs)
module_exit(exit_hugetlbfs_fs)

MODULE_LICENSE("GPL");