xref: /openbmc/linux/fs/namei.c (revision a1e58bbd)
1 /*
2  *  linux/fs/namei.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6 
7 /*
8  * Some corrections by tytso.
9  */
10 
11 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
12  * lookup logic.
13  */
14 /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
15  */
16 
17 #include <linux/init.h>
18 #include <linux/module.h>
19 #include <linux/slab.h>
20 #include <linux/fs.h>
21 #include <linux/namei.h>
22 #include <linux/quotaops.h>
23 #include <linux/pagemap.h>
24 #include <linux/fsnotify.h>
25 #include <linux/personality.h>
26 #include <linux/security.h>
27 #include <linux/syscalls.h>
28 #include <linux/mount.h>
29 #include <linux/audit.h>
30 #include <linux/capability.h>
31 #include <linux/file.h>
32 #include <linux/fcntl.h>
33 #include <asm/namei.h>
34 #include <asm/uaccess.h>
35 
/*
 * ACC_MODE(x): use the access-mode bits of x (x & O_ACCMODE) as an index
 * into a four-byte string literal whose bytes are octal 0, 4, 2 and 6.
 * NOTE(review): 4 and 2 presumably line up with MAY_READ and MAY_WRITE -
 * confirm against the MAY_* definitions and against what callers pass in.
 */
#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
37 
38 /* [Feb-1997 T. Schoebel-Theuer]
39  * Fundamental changes in the pathname lookup mechanisms (namei)
40  * were necessary because of omirr.  The reason is that omirr needs
41  * to know the _real_ pathname, not the user-supplied one, in case
42  * of symlinks (and also when transname replacements occur).
43  *
44  * The new code replaces the old recursive symlink resolution with
45  * an iterative one (in case of non-nested symlink chains).  It does
46  * this with calls to <fs>_follow_link().
47  * As a side effect, dir_namei(), _namei() and follow_link() are now
48  * replaced with a single function lookup_dentry() that can handle all
49  * the special cases of the former code.
50  *
51  * With the new dcache, the pathname is stored at each inode, at least as
52  * long as the refcount of the inode is positive.  As a side effect, the
53  * size of the dcache depends on the inode cache and thus is dynamic.
54  *
55  * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
56  * resolution to correspond with current state of the code.
57  *
58  * Note that the symlink resolution is not *completely* iterative.
59  * There is still a significant amount of tail- and mid- recursion in
60  * the algorithm.  Also, note that <fs>_readlink() is not used in
61  * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
62  * may return different results than <fs>_follow_link().  Many virtual
63  * filesystems (including /proc) exhibit this behavior.
64  */
65 
66 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
67  * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
68  * and the name already exists in form of a symlink, try to create the new
69  * name indicated by the symlink. The old code always complained that the
70  * name already exists, due to not following the symlink even if its target
71  * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
73  *
74  * I don't know which semantics is the right one, since I have no access
75  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
76  * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
77  * "old" one. Personally, I think the new semantics is much more logical.
78  * Note that "ln old new" where "new" is a symlink pointing to a non-existing
79  * file does succeed in both HP-UX and SunOs, but not in Solaris
80  * and in the old Linux semantics.
81  */
82 
83 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
84  * semantics.  See the comments in "open_namei" and "do_link" below.
85  *
86  * [10-Sep-98 Alan Modra] Another symlink change.
87  */
88 
89 /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
90  *	inside the path - always follow.
91  *	in the last component in creation/removal/renaming - never follow.
92  *	if LOOKUP_FOLLOW passed - follow.
93  *	if the pathname has trailing slashes - follow.
94  *	otherwise - don't follow.
95  * (applied in that order).
96  *
97  * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
98  * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
99  * During the 2.4 we need to fix the userland stuff depending on it -
100  * hopefully we will be able to get rid of that wart in 2.5. So far only
101  * XEmacs seems to be relying on it...
102  */
103 /*
104  * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
105  * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
106  * any extra contention...
107  */
108 
109 static int __link_path_walk(const char *name, struct nameidata *nd);
110 
111 /* In order to reduce some races, while at the same time doing additional
112  * checking and hopefully speeding things up, we copy filenames to the
113  * kernel data space before using them..
114  *
115  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
116  * PATH_MAX includes the nul terminator --RR.
117  */
118 static int do_getname(const char __user *filename, char *page)
119 {
120 	int retval;
121 	unsigned long len = PATH_MAX;
122 
123 	if (!segment_eq(get_fs(), KERNEL_DS)) {
124 		if ((unsigned long) filename >= TASK_SIZE)
125 			return -EFAULT;
126 		if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
127 			len = TASK_SIZE - (unsigned long) filename;
128 	}
129 
130 	retval = strncpy_from_user(page, filename, len);
131 	if (retval > 0) {
132 		if (retval < len)
133 			return 0;
134 		return -ENAMETOOLONG;
135 	} else if (!retval)
136 		retval = -ENOENT;
137 	return retval;
138 }
139 
140 char * getname(const char __user * filename)
141 {
142 	char *tmp, *result;
143 
144 	result = ERR_PTR(-ENOMEM);
145 	tmp = __getname();
146 	if (tmp)  {
147 		int retval = do_getname(filename, tmp);
148 
149 		result = tmp;
150 		if (retval < 0) {
151 			__putname(tmp);
152 			result = ERR_PTR(retval);
153 		}
154 	}
155 	audit_getname(result);
156 	return result;
157 }
158 
#ifdef CONFIG_AUDITSYSCALL
/*
 * Dispose of a pathname buffer.  When the audit context is not a dummy one
 * the name is handed to audit_putname(); otherwise it goes straight to
 * __putname().
 */
void putname(const char *name)
{
	if (unlikely(!audit_dummy_context())) {
		audit_putname(name);
		return;
	}
	__putname(name);
}
EXPORT_SYMBOL(putname);
#endif
169 
170 
171 /**
172  * generic_permission  -  check for access rights on a Posix-like filesystem
173  * @inode:	inode to check access rights for
174  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
175  * @check_acl:	optional callback to check for Posix ACLs
176  *
177  * Used to check for read/write/execute permissions on a file.
178  * We use "fsuid" for this, letting us set arbitrary permissions
179  * for filesystem access without changing the "normal" uids which
180  * are used for other things..
181  */
182 int generic_permission(struct inode *inode, int mask,
183 		int (*check_acl)(struct inode *inode, int mask))
184 {
185 	umode_t			mode = inode->i_mode;
186 
187 	if (current->fsuid == inode->i_uid)
188 		mode >>= 6;
189 	else {
190 		if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
191 			int error = check_acl(inode, mask);
192 			if (error == -EACCES)
193 				goto check_capabilities;
194 			else if (error != -EAGAIN)
195 				return error;
196 		}
197 
198 		if (in_group_p(inode->i_gid))
199 			mode >>= 3;
200 	}
201 
202 	/*
203 	 * If the DACs are ok we don't need any capability check.
204 	 */
205 	if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask))
206 		return 0;
207 
208  check_capabilities:
209 	/*
210 	 * Read/write DACs are always overridable.
211 	 * Executable DACs are overridable if at least one exec bit is set.
212 	 */
213 	if (!(mask & MAY_EXEC) ||
214 	    (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
215 		if (capable(CAP_DAC_OVERRIDE))
216 			return 0;
217 
218 	/*
219 	 * Searching includes executable on directories, else just read.
220 	 */
221 	if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
222 		if (capable(CAP_DAC_READ_SEARCH))
223 			return 0;
224 
225 	return -EACCES;
226 }
227 
228 int permission(struct inode *inode, int mask, struct nameidata *nd)
229 {
230 	int retval, submask;
231 	struct vfsmount *mnt = NULL;
232 
233 	if (nd)
234 		mnt = nd->path.mnt;
235 
236 	if (mask & MAY_WRITE) {
237 		umode_t mode = inode->i_mode;
238 
239 		/*
240 		 * Nobody gets write access to a read-only fs.
241 		 */
242 		if (IS_RDONLY(inode) &&
243 		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
244 			return -EROFS;
245 
246 		/*
247 		 * Nobody gets write access to an immutable file.
248 		 */
249 		if (IS_IMMUTABLE(inode))
250 			return -EACCES;
251 	}
252 
253 	if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
254 		/*
255 		 * MAY_EXEC on regular files is denied if the fs is mounted
256 		 * with the "noexec" flag.
257 		 */
258 		if (mnt && (mnt->mnt_flags & MNT_NOEXEC))
259 			return -EACCES;
260 	}
261 
262 	/* Ordinary permission routines do not understand MAY_APPEND. */
263 	submask = mask & ~MAY_APPEND;
264 	if (inode->i_op && inode->i_op->permission) {
265 		retval = inode->i_op->permission(inode, submask, nd);
266 		if (!retval) {
267 			/*
268 			 * Exec permission on a regular file is denied if none
269 			 * of the execute bits are set.
270 			 *
271 			 * This check should be done by the ->permission()
272 			 * method.
273 			 */
274 			if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode) &&
275 			    !(inode->i_mode & S_IXUGO))
276 				return -EACCES;
277 		}
278 	} else {
279 		retval = generic_permission(inode, submask, NULL);
280 	}
281 	if (retval)
282 		return retval;
283 
284 	return security_inode_permission(inode, mask, nd);
285 }
286 
287 /**
288  * vfs_permission  -  check for access rights to a given path
289  * @nd:		lookup result that describes the path
290  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
291  *
292  * Used to check for read/write/execute permissions on a path.
293  * We use "fsuid" for this, letting us set arbitrary permissions
294  * for filesystem access without changing the "normal" uids which
295  * are used for other things.
296  */
297 int vfs_permission(struct nameidata *nd, int mask)
298 {
299 	return permission(nd->path.dentry->d_inode, mask, nd);
300 }
301 
302 /**
303  * file_permission  -  check for additional access rights to a given file
304  * @file:	file to check access rights for
305  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
306  *
307  * Used to check for read/write/execute permissions on an already opened
308  * file.
309  *
310  * Note:
311  *	Do not use this function in new code.  All access checks should
312  *	be done using vfs_permission().
313  */
314 int file_permission(struct file *file, int mask)
315 {
316 	return permission(file->f_path.dentry->d_inode, mask, NULL);
317 }
318 
319 /*
320  * get_write_access() gets write permission for a file.
321  * put_write_access() releases this write permission.
322  * This is used for regular files.
323  * We cannot support write (and maybe mmap read-write shared) accesses and
324  * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
325  * can have the following values:
326  * 0: no writers, no VM_DENYWRITE mappings
327  * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
328  * > 0: (i_writecount) users are writing to the file.
329  *
330  * Normally we operate on that counter with atomic_{inc,dec} and it's safe
331  * except for the cases where we don't hold i_writecount yet. Then we need to
332  * use {get,deny}_write_access() - these functions check the sign and refuse
333  * to do the change if sign is wrong. Exclusion between them is provided by
334  * the inode->i_lock spinlock.
335  */
336 
337 int get_write_access(struct inode * inode)
338 {
339 	spin_lock(&inode->i_lock);
340 	if (atomic_read(&inode->i_writecount) < 0) {
341 		spin_unlock(&inode->i_lock);
342 		return -ETXTBSY;
343 	}
344 	atomic_inc(&inode->i_writecount);
345 	spin_unlock(&inode->i_lock);
346 
347 	return 0;
348 }
349 
350 int deny_write_access(struct file * file)
351 {
352 	struct inode *inode = file->f_path.dentry->d_inode;
353 
354 	spin_lock(&inode->i_lock);
355 	if (atomic_read(&inode->i_writecount) > 0) {
356 		spin_unlock(&inode->i_lock);
357 		return -ETXTBSY;
358 	}
359 	atomic_dec(&inode->i_writecount);
360 	spin_unlock(&inode->i_lock);
361 
362 	return 0;
363 }
364 
365 /**
366  * path_get - get a reference to a path
367  * @path: path to get the reference to
368  *
369  * Given a path increment the reference count to the dentry and the vfsmount.
370  */
371 void path_get(struct path *path)
372 {
373 	mntget(path->mnt);
374 	dget(path->dentry);
375 }
376 EXPORT_SYMBOL(path_get);
377 
378 /**
379  * path_put - put a reference to a path
380  * @path: path to put the reference to
381  *
382  * Given a path decrement the reference count to the dentry and the vfsmount.
383  */
384 void path_put(struct path *path)
385 {
386 	dput(path->dentry);
387 	mntput(path->mnt);
388 }
389 EXPORT_SYMBOL(path_put);
390 
391 /**
392  * release_open_intent - free up open intent resources
393  * @nd: pointer to nameidata
394  */
395 void release_open_intent(struct nameidata *nd)
396 {
397 	if (nd->intent.open.file->f_path.dentry == NULL)
398 		put_filp(nd->intent.open.file);
399 	else
400 		fput(nd->intent.open.file);
401 }
402 
403 static inline struct dentry *
404 do_revalidate(struct dentry *dentry, struct nameidata *nd)
405 {
406 	int status = dentry->d_op->d_revalidate(dentry, nd);
407 	if (unlikely(status <= 0)) {
408 		/*
409 		 * The dentry failed validation.
410 		 * If d_revalidate returned 0 attempt to invalidate
411 		 * the dentry otherwise d_revalidate is asking us
412 		 * to return a fail status.
413 		 */
414 		if (!status) {
415 			if (!d_invalidate(dentry)) {
416 				dput(dentry);
417 				dentry = NULL;
418 			}
419 		} else {
420 			dput(dentry);
421 			dentry = ERR_PTR(status);
422 		}
423 	}
424 	return dentry;
425 }
426 
427 /*
428  * Internal lookup() using the new generic dcache.
429  * SMP-safe
430  */
431 static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
432 {
433 	struct dentry * dentry = __d_lookup(parent, name);
434 
435 	/* lockess __d_lookup may fail due to concurrent d_move()
436 	 * in some unrelated directory, so try with d_lookup
437 	 */
438 	if (!dentry)
439 		dentry = d_lookup(parent, name);
440 
441 	if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
442 		dentry = do_revalidate(dentry, nd);
443 
444 	return dentry;
445 }
446 
447 /*
448  * Short-cut version of permission(), for calling by
449  * path_walk(), when dcache lock is held.  Combines parts
450  * of permission() and generic_permission(), and tests ONLY for
451  * MAY_EXEC permission.
452  *
453  * If appropriate, check DAC only.  If not appropriate, or
454  * short-cut DAC fails, then call permission() to do more
455  * complete permission check.
456  */
457 static int exec_permission_lite(struct inode *inode,
458 				       struct nameidata *nd)
459 {
460 	umode_t	mode = inode->i_mode;
461 
462 	if (inode->i_op && inode->i_op->permission)
463 		return -EAGAIN;
464 
465 	if (current->fsuid == inode->i_uid)
466 		mode >>= 6;
467 	else if (in_group_p(inode->i_gid))
468 		mode >>= 3;
469 
470 	if (mode & MAY_EXEC)
471 		goto ok;
472 
473 	if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
474 		goto ok;
475 
476 	if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
477 		goto ok;
478 
479 	if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH))
480 		goto ok;
481 
482 	return -EACCES;
483 ok:
484 	return security_inode_permission(inode, MAY_EXEC, nd);
485 }
486 
487 /*
488  * This is called when everything else fails, and we actually have
489  * to go to the low-level filesystem to find out what we should do..
490  *
491  * We get the directory semaphore, and after getting that we also
492  * make sure that nobody added the entry to the dcache in the meantime..
493  * SMP-safe
494  */
495 static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
496 {
497 	struct dentry * result;
498 	struct inode *dir = parent->d_inode;
499 
500 	mutex_lock(&dir->i_mutex);
501 	/*
502 	 * First re-do the cached lookup just in case it was created
503 	 * while we waited for the directory semaphore..
504 	 *
505 	 * FIXME! This could use version numbering or similar to
506 	 * avoid unnecessary cache lookups.
507 	 *
508 	 * The "dcache_lock" is purely to protect the RCU list walker
509 	 * from concurrent renames at this point (we mustn't get false
510 	 * negatives from the RCU list walk here, unlike the optimistic
511 	 * fast walk).
512 	 *
513 	 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
514 	 */
515 	result = d_lookup(parent, name);
516 	if (!result) {
517 		struct dentry * dentry = d_alloc(parent, name);
518 		result = ERR_PTR(-ENOMEM);
519 		if (dentry) {
520 			result = dir->i_op->lookup(dir, dentry, nd);
521 			if (result)
522 				dput(dentry);
523 			else
524 				result = dentry;
525 		}
526 		mutex_unlock(&dir->i_mutex);
527 		return result;
528 	}
529 
530 	/*
531 	 * Uhhuh! Nasty case: the cache was re-populated while
532 	 * we waited on the semaphore. Need to revalidate.
533 	 */
534 	mutex_unlock(&dir->i_mutex);
535 	if (result->d_op && result->d_op->d_revalidate) {
536 		result = do_revalidate(result, nd);
537 		if (!result)
538 			result = ERR_PTR(-ENOENT);
539 	}
540 	return result;
541 }
542 
543 static int __emul_lookup_dentry(const char *, struct nameidata *);
544 
545 /* SMP-safe */
546 static __always_inline int
547 walk_init_root(const char *name, struct nameidata *nd)
548 {
549 	struct fs_struct *fs = current->fs;
550 
551 	read_lock(&fs->lock);
552 	if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) {
553 		nd->path = fs->altroot;
554 		path_get(&fs->altroot);
555 		read_unlock(&fs->lock);
556 		if (__emul_lookup_dentry(name,nd))
557 			return 0;
558 		read_lock(&fs->lock);
559 	}
560 	nd->path = fs->root;
561 	path_get(&fs->root);
562 	read_unlock(&fs->lock);
563 	return 1;
564 }
565 
566 /*
567  * Wrapper to retry pathname resolution whenever the underlying
568  * file system returns an ESTALE.
569  *
570  * Retry the whole path once, forcing real lookup requests
571  * instead of relying on the dcache.
572  */
573 static __always_inline int link_path_walk(const char *name, struct nameidata *nd)
574 {
575 	struct path save = nd->path;
576 	int result;
577 
578 	/* make sure the stuff we saved doesn't go away */
579 	dget(save.dentry);
580 	mntget(save.mnt);
581 
582 	result = __link_path_walk(name, nd);
583 	if (result == -ESTALE) {
584 		/* nd->path had been dropped */
585 		nd->path = save;
586 		dget(nd->path.dentry);
587 		mntget(nd->path.mnt);
588 		nd->flags |= LOOKUP_REVAL;
589 		result = __link_path_walk(name, nd);
590 	}
591 
592 	path_put(&save);
593 
594 	return result;
595 }
596 
/*
 * Continue a path walk through the body of a symlink.
 *
 * @nd:   walk state; a relative @link is resolved starting at nd->path
 * @link: the link text, or an ERR_PTR() value
 *
 * Returns 0 on success or a negative errno.  On the failure paths handled
 * here (error-valued @link, or no memory for the name copy) nd->path is
 * dropped before returning.
 */
static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
{
	int res = 0;
	char *name;
	/* @link carries an error: drop nd->path and return that error. */
	if (IS_ERR(link))
		goto fail;

	/* Absolute link: give up the current position and restart at a root. */
	if (*link == '/') {
		path_put(&nd->path);
		if (!walk_init_root(link, nd))
			/* weird __emul_prefix() stuff did it */
			goto out;
	}
	res = link_path_walk(link, nd);
out:
	/*
	 * Nothing more to do for a nested walk, for a failed walk, or when
	 * the last component is not an ordinary name.
	 */
	if (nd->depth || res || nd->last_type!=LAST_NORM)
		return res;
	/*
	 * If it is an iterative symlinks resolution in open_namei() we
	 * have to copy the last component. And all that crap because of
	 * bloody create() on broken symlinks. Furrfu...
	 */
	name = __getname();
	if (unlikely(!name)) {
		path_put(&nd->path);
		return -ENOMEM;
	}
	/*
	 * nd->last.name is redirected to the private copy.
	 * NOTE(review): presumably the caller releases this buffer - confirm.
	 */
	strcpy(name, nd->last.name);
	nd->last.name = name;
	return 0;
fail:
	path_put(&nd->path);
	return PTR_ERR(link);
}
631 
632 static void path_put_conditional(struct path *path, struct nameidata *nd)
633 {
634 	dput(path->dentry);
635 	if (path->mnt != nd->path.mnt)
636 		mntput(path->mnt);
637 }
638 
639 static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
640 {
641 	dput(nd->path.dentry);
642 	if (nd->path.mnt != path->mnt)
643 		mntput(nd->path.mnt);
644 	nd->path.mnt = path->mnt;
645 	nd->path.dentry = path->dentry;
646 }
647 
/*
 * Follow the symlink at @path on behalf of the walk described by @nd.
 *
 * Calls the inode's ->follow_link(); if that leaves a link body in @nd
 * (nd_get_link()), the walk continues through it via __vfs_follow_link().
 * ->put_link(), when the filesystem provides it, is called afterwards with
 * the cookie returned by ->follow_link().
 *
 * Returns 0 or a negative errno.  The references held through @path are
 * dropped (path_put()) before returning.
 */
static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd)
{
	int error;
	void *cookie;
	struct dentry *dentry = path->dentry;

	touch_atime(path->mnt, dentry);
	/* Start with no link body recorded in nd. */
	nd_set_link(nd, NULL);

	/*
	 * The link sits on a different vfsmount than nd: move nd onto it.
	 * path_to_nameidata() takes no reference of its own, so an extra
	 * dentry reference is taken here.
	 */
	if (path->mnt != nd->path.mnt) {
		path_to_nameidata(path, nd);
		dget(dentry);
	}
	/* Balanced by the mntput() inside path_put(path) below. */
	mntget(path->mnt);
	cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
	error = PTR_ERR(cookie);
	if (!IS_ERR(cookie)) {
		char *s = nd_get_link(nd);
		error = 0;
		/* A NULL body means there is no link text to walk here. */
		if (s)
			error = __vfs_follow_link(nd, s);
		if (dentry->d_inode->i_op->put_link)
			dentry->d_inode->i_op->put_link(dentry, nd, cookie);
	}
	path_put(path);

	return error;
}
676 
/*
 * This limits recursive symlink follows to 8, while
 * limiting consecutive symlinks to 40.
 *
 * Without that kind of total limit, nasty chains of consecutive
 * symlinks can cause almost arbitrarily long lookups.
 *
 * Returns 0 or a negative errno: -ELOOP when either limit has been
 * reached, the security hook's error when it refuses, otherwise the
 * result of __do_follow_link().  On the refusal paths ("loop") both @path
 * and nd->path are released here.
 */
static inline int do_follow_link(struct path *path, struct nameidata *nd)
{
	int err = -ELOOP;
	/* Nesting limit, tracked per task. */
	if (current->link_count >= MAX_NESTED_LINKS)
		goto loop;
	/* Total number of links followed during this walk, per task. */
	if (current->total_link_count >= 40)
		goto loop;
	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
	cond_resched();
	err = security_inode_follow_link(path->dentry, nd);
	if (err)
		goto loop;
	current->link_count++;
	current->total_link_count++;
	nd->depth++;
	err = __do_follow_link(path, nd);
	/* Nesting counters are undone; total_link_count is not. */
	current->link_count--;
	nd->depth--;
	return err;
loop:
	path_put_conditional(path, nd);
	path_put(&nd->path);
	return err;
}
708 
709 int follow_up(struct vfsmount **mnt, struct dentry **dentry)
710 {
711 	struct vfsmount *parent;
712 	struct dentry *mountpoint;
713 	spin_lock(&vfsmount_lock);
714 	parent=(*mnt)->mnt_parent;
715 	if (parent == *mnt) {
716 		spin_unlock(&vfsmount_lock);
717 		return 0;
718 	}
719 	mntget(parent);
720 	mountpoint=dget((*mnt)->mnt_mountpoint);
721 	spin_unlock(&vfsmount_lock);
722 	dput(*dentry);
723 	*dentry = mountpoint;
724 	mntput(*mnt);
725 	*mnt = parent;
726 	return 1;
727 }
728 
729 /* no need for dcache_lock, as serialization is taken care in
730  * namespace.c
731  */
732 static int __follow_mount(struct path *path)
733 {
734 	int res = 0;
735 	while (d_mountpoint(path->dentry)) {
736 		struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry);
737 		if (!mounted)
738 			break;
739 		dput(path->dentry);
740 		if (res)
741 			mntput(path->mnt);
742 		path->mnt = mounted;
743 		path->dentry = dget(mounted->mnt_root);
744 		res = 1;
745 	}
746 	return res;
747 }
748 
749 static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
750 {
751 	while (d_mountpoint(*dentry)) {
752 		struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
753 		if (!mounted)
754 			break;
755 		dput(*dentry);
756 		mntput(*mnt);
757 		*mnt = mounted;
758 		*dentry = dget(mounted->mnt_root);
759 	}
760 }
761 
762 /* no need for dcache_lock, as serialization is taken care in
763  * namespace.c
764  */
765 int follow_down(struct vfsmount **mnt, struct dentry **dentry)
766 {
767 	struct vfsmount *mounted;
768 
769 	mounted = lookup_mnt(*mnt, *dentry);
770 	if (mounted) {
771 		dput(*dentry);
772 		mntput(*mnt);
773 		*mnt = mounted;
774 		*dentry = dget(mounted->mnt_root);
775 		return 1;
776 	}
777 	return 0;
778 }
779 
/*
 * Move nd->path one level up, as required for a ".." component.
 *
 * Each pass of the loop handles one of these cases:
 *  - nd->path is the process root (fs->root): stay where we are;
 *  - nd->path is not the root of its vfsmount: step to the dentry's
 *    d_parent and stop;
 *  - nd->path is the root of its vfsmount: if the vfsmount is its own
 *    parent, stop; otherwise move to the mountpoint in the parent
 *    vfsmount and go round again.
 * Finally follow_mount() is applied to the resulting position.
 */
static __always_inline void follow_dotdot(struct nameidata *nd)
{
	struct fs_struct *fs = current->fs;

	while(1) {
		struct vfsmount *parent;
		struct dentry *old = nd->path.dentry;

		/* fs->lock guards the comparison against fs->root. */
                read_lock(&fs->lock);
		if (nd->path.dentry == fs->root.dentry &&
		    nd->path.mnt == fs->root.mnt) {
                        read_unlock(&fs->lock);
			break;
		}
                read_unlock(&fs->lock);
		/* d_parent is read, and its reference taken, under dcache_lock. */
		spin_lock(&dcache_lock);
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
			nd->path.dentry = dget(nd->path.dentry->d_parent);
			spin_unlock(&dcache_lock);
			dput(old);
			break;
		}
		spin_unlock(&dcache_lock);
		/* At the root of this vfsmount: look at the mount tree instead. */
		spin_lock(&vfsmount_lock);
		parent = nd->path.mnt->mnt_parent;
		if (parent == nd->path.mnt) {
			spin_unlock(&vfsmount_lock);
			break;
		}
		/* New references are taken before the lock is released ... */
		mntget(parent);
		nd->path.dentry = dget(nd->path.mnt->mnt_mountpoint);
		spin_unlock(&vfsmount_lock);
		/* ... and the old ones are dropped only afterwards. */
		dput(old);
		mntput(nd->path.mnt);
		nd->path.mnt = parent;
	}
	follow_mount(&nd->path.mnt, &nd->path.dentry);
}
818 
819 /*
820  *  It's more convoluted than I'd like it to be, but... it's still fairly
821  *  small and for now I'd prefer to have fast path as straight as possible.
822  *  It _is_ time-critical.
823  */
824 static int do_lookup(struct nameidata *nd, struct qstr *name,
825 		     struct path *path)
826 {
827 	struct vfsmount *mnt = nd->path.mnt;
828 	struct dentry *dentry = __d_lookup(nd->path.dentry, name);
829 
830 	if (!dentry)
831 		goto need_lookup;
832 	if (dentry->d_op && dentry->d_op->d_revalidate)
833 		goto need_revalidate;
834 done:
835 	path->mnt = mnt;
836 	path->dentry = dentry;
837 	__follow_mount(path);
838 	return 0;
839 
840 need_lookup:
841 	dentry = real_lookup(nd->path.dentry, name, nd);
842 	if (IS_ERR(dentry))
843 		goto fail;
844 	goto done;
845 
846 need_revalidate:
847 	dentry = do_revalidate(dentry, nd);
848 	if (!dentry)
849 		goto need_lookup;
850 	if (IS_ERR(dentry))
851 		goto fail;
852 	goto done;
853 
854 fail:
855 	return PTR_ERR(dentry);
856 }
857 
/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * @name: pathname to resolve, starting at nd->path; leading slashes are
 *        skipped
 * @nd:   walk state; nd->flags selects the behaviour for the last
 *        component (LOOKUP_PARENT, LOOKUP_FOLLOW, LOOKUP_DIRECTORY)
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 *
 * Note on control flow: the labels last_with_slashes, last_component,
 * lookup_parent, return_reval, return_base and out_dput all live inside
 * the for(;;) body, so a "break" after any of them leaves the loop and
 * reaches the path_put() at the bottom.
 */
static int __link_path_walk(const char *name, struct nameidata *nd)
{
	struct path next;
	struct inode *inode;
	int err;
	unsigned int lookup_flags = nd->flags;

	while (*name=='/')
		name++;
	/* Nothing but slashes: the starting point is the answer. */
	if (!*name)
		goto return_reval;

	inode = nd->path.dentry->d_inode;
	/* Inside a nested symlink walk the last component is always followed. */
	if (nd->depth)
		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);

	/* At this point we know we have a real path component. */
	for(;;) {
		unsigned long hash;
		struct qstr this;
		unsigned int c;

		nd->flags |= LOOKUP_CONTINUE;
		/* Search permission on the current directory; slow path on -EAGAIN. */
		err = exec_permission_lite(inode, nd);
		if (err == -EAGAIN)
			err = vfs_permission(nd, MAY_EXEC);
 		if (err)
			break;

		this.name = name;
		c = *(const unsigned char *)name;

		/* Hash the component while scanning for its end ('/' or NUL). */
		hash = init_name_hash();
		do {
			name++;
			hash = partial_name_hash(c, hash);
			c = *(const unsigned char *)name;
		} while (c && (c != '/'));
		this.len = name - (const char *) this.name;
		this.hash = end_name_hash(hash);

		/* remove trailing slashes? */
		if (!c)
			goto last_component;
		while (*++name == '/');
		if (!*name)
			goto last_with_slashes;

		/*
		 * "." and ".." are special - ".." especially so because it has
		 * to be able to know about the current root directory and
		 * parent relationships.
		 */
		if (this.name[0] == '.') switch (this.len) {
			default:
				break;
			case 2:
				if (this.name[1] != '.')
					break;
				follow_dotdot(nd);
				inode = nd->path.dentry->d_inode;
				/* fallthrough */
			case 1:
				continue;
		}
		/*
		 * See if the low-level filesystem might want
		 * to use its own hash..
		 */
		if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
			err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
							    &this);
			if (err < 0)
				break;
		}
		/* This does the actual lookups.. */
		err = do_lookup(nd, &this, &next);
		if (err)
			break;

		/* An intermediate component must exist and have inode ops. */
		err = -ENOENT;
		inode = next.dentry->d_inode;
		if (!inode)
			goto out_dput;
		err = -ENOTDIR;
		if (!inode->i_op)
			goto out_dput;

		if (inode->i_op->follow_link) {
			/* Symlinks inside the path are always followed. */
			err = do_follow_link(&next, nd);
			if (err)
				goto return_err;
			err = -ENOENT;
			inode = nd->path.dentry->d_inode;
			if (!inode)
				break;
			err = -ENOTDIR;
			if (!inode->i_op)
				break;
		} else
			path_to_nameidata(&next, nd);
		/* Only something with ->lookup() can hold further components. */
		err = -ENOTDIR;
		if (!inode->i_op->lookup)
			break;
		continue;
		/* here ends the main loop */

last_with_slashes:
		/* Trailing slashes: follow a final symlink and insist on a directory. */
		lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
last_component:
		/* Clear LOOKUP_CONTINUE iff it was previously unset */
		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
		if (lookup_flags & LOOKUP_PARENT)
			goto lookup_parent;
		if (this.name[0] == '.') switch (this.len) {
			default:
				break;
			case 2:
				if (this.name[1] != '.')
					break;
				follow_dotdot(nd);
				inode = nd->path.dentry->d_inode;
				/* fallthrough */
			case 1:
				goto return_reval;
		}
		if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
			err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
							    &this);
			if (err < 0)
				break;
		}
		err = do_lookup(nd, &this, &next);
		if (err)
			break;
		inode = next.dentry->d_inode;
		/* The final component is followed only when LOOKUP_FOLLOW is set. */
		if ((lookup_flags & LOOKUP_FOLLOW)
		    && inode && inode->i_op && inode->i_op->follow_link) {
			err = do_follow_link(&next, nd);
			if (err)
				goto return_err;
			inode = nd->path.dentry->d_inode;
		} else
			path_to_nameidata(&next, nd);
		err = -ENOENT;
		if (!inode)
			break;
		if (lookup_flags & LOOKUP_DIRECTORY) {
			err = -ENOTDIR;
			if (!inode->i_op || !inode->i_op->lookup)
				break;
		}
		goto return_base;
lookup_parent:
		/*
		 * LOOKUP_PARENT: stop at the parent and record the last
		 * component and its kind in nd without looking it up.
		 */
		nd->last = this;
		nd->last_type = LAST_NORM;
		if (this.name[0] != '.')
			goto return_base;
		if (this.len == 1)
			nd->last_type = LAST_DOT;
		else if (this.len == 2 && this.name[1] == '.')
			nd->last_type = LAST_DOTDOT;
		else
			goto return_base;
return_reval:
		/*
		 * We bypassed the ordinary revalidation routines.
		 * We may need to check the cached dentry for staleness.
		 */
		if (nd->path.dentry && nd->path.dentry->d_sb &&
		    (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
			err = -ESTALE;
			/* Note: we do not d_invalidate() */
			/*
			 * NOTE(review): d_op is dereferenced without a NULL
			 * check here; presumably every FS_REVAL_DOT filesystem
			 * sets d_op and ->d_revalidate - confirm.
			 */
			if (!nd->path.dentry->d_op->d_revalidate(
					nd->path.dentry, nd))
				break;
		}
return_base:
		return 0;
out_dput:
		/* Drop the looked-up entry that was never installed into nd. */
		path_put_conditional(&next, nd);
		break;
	}
	path_put(&nd->path);
return_err:
	return err;
}
1053 
1054 static int path_walk(const char *name, struct nameidata *nd)
1055 {
1056 	current->total_link_count = 0;
1057 	return link_path_walk(name, nd);
1058 }
1059 
/*
 * SMP-safe: Returns 1 and nd will have valid dentry and mnt, if
 * everything is done. Returns 0 and drops input nd, if lookup failed.
 *
 * @name is first walked from wherever nd->path points on entry (callers
 * in this file set it to the alternate root).  If that walk yields a
 * negative dentry or a directory, the same name is tried from fs->root as
 * well; a positive result there replaces the first one, otherwise the
 * first result and the saved nd->last / nd->last_type are restored.
 */
static int __emul_lookup_dentry(const char *name, struct nameidata *nd)
{
	if (path_walk(name, nd))
		return 0;		/* something went wrong... */

	if (!nd->path.dentry->d_inode ||
	    S_ISDIR(nd->path.dentry->d_inode->i_mode)) {
		/* Remember the first result so it can be put back. */
		struct path old_path = nd->path;
		struct qstr last = nd->last;
		int last_type = nd->last_type;
		struct fs_struct *fs = current->fs;

		/*
		 * NAME was not found in alternate root or it's a directory.
		 * Try to find it in the normal root:
		 */
		nd->last_type = LAST_ROOT;
		read_lock(&fs->lock);
		nd->path = fs->root;
		path_get(&fs->root);
		read_unlock(&fs->lock);
		if (path_walk(name, nd) == 0) {
			if (nd->path.dentry->d_inode) {
				/* Positive hit under the normal root wins. */
				path_put(&old_path);
				return 1;
			}
			/* Negative under the normal root too: discard it. */
			path_put(&nd->path);
		}
		/* Fall back to the result of the first walk. */
		nd->path = old_path;
		nd->last = last;
		nd->last_type = last_type;
	}
	return 1;
}
1098 
1099 void set_fs_altroot(void)
1100 {
1101 	char *emul = __emul_prefix();
1102 	struct nameidata nd;
1103 	struct path path = {}, old_path;
1104 	int err;
1105 	struct fs_struct *fs = current->fs;
1106 
1107 	if (!emul)
1108 		goto set_it;
1109 	err = path_lookup(emul, LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_NOALT, &nd);
1110 	if (!err)
1111 		path = nd.path;
1112 set_it:
1113 	write_lock(&fs->lock);
1114 	old_path = fs->altroot;
1115 	fs->altroot = path;
1116 	write_unlock(&fs->lock);
1117 	if (old_path.dentry)
1118 		path_put(&old_path);
1119 }
1120 
1121 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
1122 static int do_path_lookup(int dfd, const char *name,
1123 				unsigned int flags, struct nameidata *nd)
1124 {
1125 	int retval = 0;
1126 	int fput_needed;
1127 	struct file *file;
1128 	struct fs_struct *fs = current->fs;
1129 
1130 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
1131 	nd->flags = flags;
1132 	nd->depth = 0;
1133 
1134 	if (*name=='/') {
1135 		read_lock(&fs->lock);
1136 		if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) {
1137 			nd->path = fs->altroot;
1138 			path_get(&fs->altroot);
1139 			read_unlock(&fs->lock);
1140 			if (__emul_lookup_dentry(name,nd))
1141 				goto out; /* found in altroot */
1142 			read_lock(&fs->lock);
1143 		}
1144 		nd->path = fs->root;
1145 		path_get(&fs->root);
1146 		read_unlock(&fs->lock);
1147 	} else if (dfd == AT_FDCWD) {
1148 		read_lock(&fs->lock);
1149 		nd->path = fs->pwd;
1150 		path_get(&fs->pwd);
1151 		read_unlock(&fs->lock);
1152 	} else {
1153 		struct dentry *dentry;
1154 
1155 		file = fget_light(dfd, &fput_needed);
1156 		retval = -EBADF;
1157 		if (!file)
1158 			goto out_fail;
1159 
1160 		dentry = file->f_path.dentry;
1161 
1162 		retval = -ENOTDIR;
1163 		if (!S_ISDIR(dentry->d_inode->i_mode))
1164 			goto fput_fail;
1165 
1166 		retval = file_permission(file, MAY_EXEC);
1167 		if (retval)
1168 			goto fput_fail;
1169 
1170 		nd->path = file->f_path;
1171 		path_get(&file->f_path);
1172 
1173 		fput_light(file, fput_needed);
1174 	}
1175 
1176 	retval = path_walk(name, nd);
1177 out:
1178 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1179 				nd->path.dentry->d_inode))
1180 		audit_inode(name, nd->path.dentry);
1181 out_fail:
1182 	return retval;
1183 
1184 fput_fail:
1185 	fput_light(file, fput_needed);
1186 	goto out_fail;
1187 }
1188 
1189 int path_lookup(const char *name, unsigned int flags,
1190 			struct nameidata *nd)
1191 {
1192 	return do_path_lookup(AT_FDCWD, name, flags, nd);
1193 }
1194 
1195 /**
1196  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
1197  * @dentry:  pointer to dentry of the base directory
1198  * @mnt: pointer to vfs mount of the base directory
1199  * @name: pointer to file name
1200  * @flags: lookup flags
1201  * @nd: pointer to nameidata
1202  */
1203 int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1204 		    const char *name, unsigned int flags,
1205 		    struct nameidata *nd)
1206 {
1207 	int retval;
1208 
1209 	/* same as do_path_lookup */
1210 	nd->last_type = LAST_ROOT;
1211 	nd->flags = flags;
1212 	nd->depth = 0;
1213 
1214 	nd->path.mnt = mntget(mnt);
1215 	nd->path.dentry = dget(dentry);
1216 
1217 	retval = path_walk(name, nd);
1218 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1219 				nd->path.dentry->d_inode))
1220 		audit_inode(name, nd->path.dentry);
1221 
1222 	return retval;
1223 
1224 }
1225 
1226 static int __path_lookup_intent_open(int dfd, const char *name,
1227 		unsigned int lookup_flags, struct nameidata *nd,
1228 		int open_flags, int create_mode)
1229 {
1230 	struct file *filp = get_empty_filp();
1231 	int err;
1232 
1233 	if (filp == NULL)
1234 		return -ENFILE;
1235 	nd->intent.open.file = filp;
1236 	nd->intent.open.flags = open_flags;
1237 	nd->intent.open.create_mode = create_mode;
1238 	err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);
1239 	if (IS_ERR(nd->intent.open.file)) {
1240 		if (err == 0) {
1241 			err = PTR_ERR(nd->intent.open.file);
1242 			path_put(&nd->path);
1243 		}
1244 	} else if (err != 0)
1245 		release_open_intent(nd);
1246 	return err;
1247 }
1248 
/**
 * path_lookup_open - lookup a file path with open intent
 * @dfd: the directory to use as base, or AT_FDCWD
 * @name: pointer to file name
 * @lookup_flags: lookup intent flags
 * @nd: pointer to nameidata
 * @open_flags: open intent flags
 *
 * The create mode of the intent is set to 0.
 */
int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
		struct nameidata *nd, int open_flags)
{
	const int no_create_mode = 0;

	return __path_lookup_intent_open(dfd, name, lookup_flags, nd,
					 open_flags, no_create_mode);
}
1263 
1264 /**
1265  * path_lookup_create - lookup a file path with open + create intent
1266  * @dfd: the directory to use as base, or AT_FDCWD
1267  * @name: pointer to file name
1268  * @lookup_flags: lookup intent flags
1269  * @nd: pointer to nameidata
1270  * @open_flags: open intent flags
1271  * @create_mode: create intent flags
1272  */
1273 static int path_lookup_create(int dfd, const char *name,
1274 			      unsigned int lookup_flags, struct nameidata *nd,
1275 			      int open_flags, int create_mode)
1276 {
1277 	return __path_lookup_intent_open(dfd, name, lookup_flags|LOOKUP_CREATE,
1278 			nd, open_flags, create_mode);
1279 }
1280 
1281 int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
1282 		struct nameidata *nd, int open_flags)
1283 {
1284 	char *tmp = getname(name);
1285 	int err = PTR_ERR(tmp);
1286 
1287 	if (!IS_ERR(tmp)) {
1288 		err = __path_lookup_intent_open(AT_FDCWD, tmp, lookup_flags, nd, open_flags, 0);
1289 		putname(tmp);
1290 	}
1291 	return err;
1292 }
1293 
1294 static struct dentry *__lookup_hash(struct qstr *name,
1295 		struct dentry *base, struct nameidata *nd)
1296 {
1297 	struct dentry *dentry;
1298 	struct inode *inode;
1299 	int err;
1300 
1301 	inode = base->d_inode;
1302 
1303 	/*
1304 	 * See if the low-level filesystem might want
1305 	 * to use its own hash..
1306 	 */
1307 	if (base->d_op && base->d_op->d_hash) {
1308 		err = base->d_op->d_hash(base, name);
1309 		dentry = ERR_PTR(err);
1310 		if (err < 0)
1311 			goto out;
1312 	}
1313 
1314 	dentry = cached_lookup(base, name, nd);
1315 	if (!dentry) {
1316 		struct dentry *new = d_alloc(base, name);
1317 		dentry = ERR_PTR(-ENOMEM);
1318 		if (!new)
1319 			goto out;
1320 		dentry = inode->i_op->lookup(inode, new, nd);
1321 		if (!dentry)
1322 			dentry = new;
1323 		else
1324 			dput(new);
1325 	}
1326 out:
1327 	return dentry;
1328 }
1329 
1330 /*
1331  * Restricted form of lookup. Doesn't follow links, single-component only,
1332  * needs parent already locked. Doesn't follow mounts.
1333  * SMP-safe.
1334  */
1335 static struct dentry *lookup_hash(struct nameidata *nd)
1336 {
1337 	int err;
1338 
1339 	err = permission(nd->path.dentry->d_inode, MAY_EXEC, nd);
1340 	if (err)
1341 		return ERR_PTR(err);
1342 	return __lookup_hash(&nd->last, nd->path.dentry, nd);
1343 }
1344 
1345 static int __lookup_one_len(const char *name, struct qstr *this,
1346 		struct dentry *base, int len)
1347 {
1348 	unsigned long hash;
1349 	unsigned int c;
1350 
1351 	this->name = name;
1352 	this->len = len;
1353 	if (!len)
1354 		return -EACCES;
1355 
1356 	hash = init_name_hash();
1357 	while (len--) {
1358 		c = *(const unsigned char *)name++;
1359 		if (c == '/' || c == '\0')
1360 			return -EACCES;
1361 		hash = partial_name_hash(c, hash);
1362 	}
1363 	this->hash = end_name_hash(hash);
1364 	return 0;
1365 }
1366 
1367 /**
1368  * lookup_one_len - filesystem helper to lookup single pathname component
1369  * @name:	pathname component to lookup
1370  * @base:	base directory to lookup from
1371  * @len:	maximum length @len should be interpreted to
1372  *
1373  * Note that this routine is purely a helper for filesystem usage and should
1374  * not be called by generic code.  Also note that by using this function the
1375  * nameidata argument is passed to the filesystem methods and a filesystem
1376  * using this helper needs to be prepared for that.
1377  */
1378 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1379 {
1380 	int err;
1381 	struct qstr this;
1382 
1383 	err = __lookup_one_len(name, &this, base, len);
1384 	if (err)
1385 		return ERR_PTR(err);
1386 
1387 	err = permission(base->d_inode, MAY_EXEC, NULL);
1388 	if (err)
1389 		return ERR_PTR(err);
1390 	return __lookup_hash(&this, base, NULL);
1391 }
1392 
1393 /**
1394  * lookup_one_noperm - bad hack for sysfs
1395  * @name:	pathname component to lookup
1396  * @base:	base directory to lookup from
1397  *
1398  * This is a variant of lookup_one_len that doesn't perform any permission
1399  * checks.   It's a horrible hack to work around the braindead sysfs
1400  * architecture and should not be used anywhere else.
1401  *
1402  * DON'T USE THIS FUNCTION EVER, thanks.
1403  */
1404 struct dentry *lookup_one_noperm(const char *name, struct dentry *base)
1405 {
1406 	int err;
1407 	struct qstr this;
1408 
1409 	err = __lookup_one_len(name, &this, base, strlen(name));
1410 	if (err)
1411 		return ERR_PTR(err);
1412 	return __lookup_hash(&this, base, NULL);
1413 }
1414 
1415 int __user_walk_fd(int dfd, const char __user *name, unsigned flags,
1416 			    struct nameidata *nd)
1417 {
1418 	char *tmp = getname(name);
1419 	int err = PTR_ERR(tmp);
1420 
1421 	if (!IS_ERR(tmp)) {
1422 		err = do_path_lookup(dfd, tmp, flags, nd);
1423 		putname(tmp);
1424 	}
1425 	return err;
1426 }
1427 
1428 int __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
1429 {
1430 	return __user_walk_fd(AT_FDCWD, name, flags, nd);
1431 }
1432 
1433 /*
1434  * It's inline, so penalty for filesystems that don't use sticky bit is
1435  * minimal.
1436  */
1437 static inline int check_sticky(struct inode *dir, struct inode *inode)
1438 {
1439 	if (!(dir->i_mode & S_ISVTX))
1440 		return 0;
1441 	if (inode->i_uid == current->fsuid)
1442 		return 0;
1443 	if (dir->i_uid == current->fsuid)
1444 		return 0;
1445 	return !capable(CAP_FOWNER);
1446 }
1447 
1448 /*
1449  *	Check whether we can remove a link victim from directory dir, check
1450  *  whether the type of victim is right.
1451  *  1. We can't do it if dir is read-only (done in permission())
1452  *  2. We should have write and exec permissions on dir
1453  *  3. We can't remove anything from append-only dir
1454  *  4. We can't do anything with immutable dir (done in permission())
1455  *  5. If the sticky bit on dir is set we should either
1456  *	a. be owner of dir, or
1457  *	b. be owner of victim, or
1458  *	c. have CAP_FOWNER capability
1459  *  6. If the victim is append-only or immutable we can't do antyhing with
1460  *     links pointing to it.
1461  *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
1462  *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
1463  *  9. We can't remove a root or mountpoint.
1464  * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
1465  *     nfs_async_unlink().
1466  */
1467 static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
1468 {
1469 	int error;
1470 
1471 	if (!victim->d_inode)
1472 		return -ENOENT;
1473 
1474 	BUG_ON(victim->d_parent->d_inode != dir);
1475 	audit_inode_child(victim->d_name.name, victim, dir);
1476 
1477 	error = permission(dir,MAY_WRITE | MAY_EXEC, NULL);
1478 	if (error)
1479 		return error;
1480 	if (IS_APPEND(dir))
1481 		return -EPERM;
1482 	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
1483 	    IS_IMMUTABLE(victim->d_inode))
1484 		return -EPERM;
1485 	if (isdir) {
1486 		if (!S_ISDIR(victim->d_inode->i_mode))
1487 			return -ENOTDIR;
1488 		if (IS_ROOT(victim))
1489 			return -EBUSY;
1490 	} else if (S_ISDIR(victim->d_inode->i_mode))
1491 		return -EISDIR;
1492 	if (IS_DEADDIR(dir))
1493 		return -ENOENT;
1494 	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
1495 		return -EBUSY;
1496 	return 0;
1497 }
1498 
1499 /*	Check whether we can create an object with dentry child in directory
1500  *  dir.
1501  *  1. We can't do it if child already exists (open has special treatment for
1502  *     this case, but since we are inlined it's OK)
1503  *  2. We can't do it if dir is read-only (done in permission())
1504  *  3. We should have write and exec permissions on dir
1505  *  4. We can't do it if dir is immutable (done in permission())
1506  */
1507 static inline int may_create(struct inode *dir, struct dentry *child,
1508 			     struct nameidata *nd)
1509 {
1510 	if (child->d_inode)
1511 		return -EEXIST;
1512 	if (IS_DEADDIR(dir))
1513 		return -ENOENT;
1514 	return permission(dir,MAY_WRITE | MAY_EXEC, nd);
1515 }
1516 
1517 /*
1518  * O_DIRECTORY translates into forcing a directory lookup.
1519  */
1520 static inline int lookup_flags(unsigned int f)
1521 {
1522 	unsigned long retval = LOOKUP_FOLLOW;
1523 
1524 	if (f & O_NOFOLLOW)
1525 		retval &= ~LOOKUP_FOLLOW;
1526 
1527 	if (f & O_DIRECTORY)
1528 		retval |= LOOKUP_DIRECTORY;
1529 
1530 	return retval;
1531 }
1532 
1533 /*
1534  * p1 and p2 should be directories on the same fs.
1535  */
1536 struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
1537 {
1538 	struct dentry *p;
1539 
1540 	if (p1 == p2) {
1541 		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1542 		return NULL;
1543 	}
1544 
1545 	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
1546 
1547 	for (p = p1; p->d_parent != p; p = p->d_parent) {
1548 		if (p->d_parent == p2) {
1549 			mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
1550 			mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
1551 			return p;
1552 		}
1553 	}
1554 
1555 	for (p = p2; p->d_parent != p; p = p->d_parent) {
1556 		if (p->d_parent == p1) {
1557 			mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1558 			mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
1559 			return p;
1560 		}
1561 	}
1562 
1563 	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1564 	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
1565 	return NULL;
1566 }
1567 
1568 void unlock_rename(struct dentry *p1, struct dentry *p2)
1569 {
1570 	mutex_unlock(&p1->d_inode->i_mutex);
1571 	if (p1 != p2) {
1572 		mutex_unlock(&p2->d_inode->i_mutex);
1573 		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
1574 	}
1575 }
1576 
1577 int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1578 		struct nameidata *nd)
1579 {
1580 	int error = may_create(dir, dentry, nd);
1581 
1582 	if (error)
1583 		return error;
1584 
1585 	if (!dir->i_op || !dir->i_op->create)
1586 		return -EACCES;	/* shouldn't it be ENOSYS? */
1587 	mode &= S_IALLUGO;
1588 	mode |= S_IFREG;
1589 	error = security_inode_create(dir, dentry, mode);
1590 	if (error)
1591 		return error;
1592 	DQUOT_INIT(dir);
1593 	error = dir->i_op->create(dir, dentry, mode, nd);
1594 	if (!error)
1595 		fsnotify_create(dir, dentry);
1596 	return error;
1597 }
1598 
1599 int may_open(struct nameidata *nd, int acc_mode, int flag)
1600 {
1601 	struct dentry *dentry = nd->path.dentry;
1602 	struct inode *inode = dentry->d_inode;
1603 	int error;
1604 
1605 	if (!inode)
1606 		return -ENOENT;
1607 
1608 	if (S_ISLNK(inode->i_mode))
1609 		return -ELOOP;
1610 
1611 	if (S_ISDIR(inode->i_mode) && (acc_mode & MAY_WRITE))
1612 		return -EISDIR;
1613 
1614 	/*
1615 	 * FIFO's, sockets and device files are special: they don't
1616 	 * actually live on the filesystem itself, and as such you
1617 	 * can write to them even if the filesystem is read-only.
1618 	 */
1619 	if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
1620 	    	flag &= ~O_TRUNC;
1621 	} else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
1622 		if (nd->path.mnt->mnt_flags & MNT_NODEV)
1623 			return -EACCES;
1624 
1625 		flag &= ~O_TRUNC;
1626 	} else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
1627 		return -EROFS;
1628 
1629 	error = vfs_permission(nd, acc_mode);
1630 	if (error)
1631 		return error;
1632 	/*
1633 	 * An append-only file must be opened in append mode for writing.
1634 	 */
1635 	if (IS_APPEND(inode)) {
1636 		if  ((flag & FMODE_WRITE) && !(flag & O_APPEND))
1637 			return -EPERM;
1638 		if (flag & O_TRUNC)
1639 			return -EPERM;
1640 	}
1641 
1642 	/* O_NOATIME can only be set by the owner or superuser */
1643 	if (flag & O_NOATIME)
1644 		if (!is_owner_or_cap(inode))
1645 			return -EPERM;
1646 
1647 	/*
1648 	 * Ensure there are no outstanding leases on the file.
1649 	 */
1650 	error = break_lease(inode, flag);
1651 	if (error)
1652 		return error;
1653 
1654 	if (flag & O_TRUNC) {
1655 		error = get_write_access(inode);
1656 		if (error)
1657 			return error;
1658 
1659 		/*
1660 		 * Refuse to truncate files with mandatory locks held on them.
1661 		 */
1662 		error = locks_verify_locked(inode);
1663 		if (!error) {
1664 			DQUOT_INIT(inode);
1665 
1666 			error = do_truncate(dentry, 0,
1667 					    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1668 					    NULL);
1669 		}
1670 		put_write_access(inode);
1671 		if (error)
1672 			return error;
1673 	} else
1674 		if (flag & FMODE_WRITE)
1675 			DQUOT_INIT(inode);
1676 
1677 	return 0;
1678 }
1679 
1680 static int open_namei_create(struct nameidata *nd, struct path *path,
1681 				int flag, int mode)
1682 {
1683 	int error;
1684 	struct dentry *dir = nd->path.dentry;
1685 
1686 	if (!IS_POSIXACL(dir->d_inode))
1687 		mode &= ~current->fs->umask;
1688 	error = vfs_create(dir->d_inode, path->dentry, mode, nd);
1689 	mutex_unlock(&dir->d_inode->i_mutex);
1690 	dput(nd->path.dentry);
1691 	nd->path.dentry = path->dentry;
1692 	if (error)
1693 		return error;
1694 	/* Don't check for write permission, don't truncate */
1695 	return may_open(nd, 0, flag & ~O_TRUNC);
1696 }
1697 
1698 /*
1699  *	open_namei()
1700  *
1701  * namei for open - this is in fact almost the whole open-routine.
1702  *
1703  * Note that the low bits of "flag" aren't the same as in the open
1704  * system call - they are 00 - no permissions needed
1705  *			  01 - read permission needed
1706  *			  10 - write permission needed
1707  *			  11 - read/write permissions needed
1708  * which is a lot more logical, and also allows the "no perm" needed
1709  * for symlinks (where the permissions are checked later).
1710  * SMP-safe
1711  */
/*
 * Arguments, as they are used in the body:
 *   dfd, pathname - passed to path_lookup_open() / path_lookup_create()
 *   flag          - open flags in the encoding described above; its low
 *                   bits index ACC_MODE(), and O_CREAT, O_EXCL, O_TRUNC,
 *                   O_APPEND and O_NOFOLLOW are tested individually
 *   mode          - creation mode, forwarded to path_lookup_create() and
 *                   open_namei_create()
 *   nd            - lookup state; on success nd->path refers to the object
 *                   that passed may_open()
 *
 * Returns 0 on success, a negative errno otherwise.  Failures that leave
 * through the "exit" label release the open intent (unless
 * nd->intent.open.file is an ERR_PTR) and drop nd->path.
 */
1712 int open_namei(int dfd, const char *pathname, int flag,
1713 		int mode, struct nameidata *nd)
1714 {
1715 	int acc_mode, error;
1716 	struct path path;
1717 	struct dentry *dir;
	/* Number of passes made through do_link so far; see the 32 check. */
1718 	int count = 0;
1719 
1720 	acc_mode = ACC_MODE(flag);
1721 
1722 	/* O_TRUNC implies we need access checks for write permissions */
1723 	if (flag & O_TRUNC)
1724 		acc_mode |= MAY_WRITE;
1725 
1726 	/* Allow the LSM permission hook to distinguish append
1727 	   access from general write access. */
1728 	if (flag & O_APPEND)
1729 		acc_mode |= MAY_APPEND;
1730 
1731 	/*
1732 	 * The simplest case - just a plain lookup.
1733 	 */
1734 	if (!(flag & O_CREAT)) {
1735 		error = path_lookup_open(dfd, pathname, lookup_flags(flag),
1736 					 nd, flag);
1737 		if (error)
1738 			return error;
1739 		goto ok;
1740 	}
1741 
1742 	/*
1743 	 * Create - we need to know the parent.
1744 	 */
1745 	error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
1746 	if (error)
1747 		return error;
1748 
1749 	/*
1750 	 * We have the parent and last component. First of all, check
1751 	 * that we are not asked to creat(2) an obvious directory - that
1752 	 * will not do.
1753 	 */
	/*
	 * -EISDIR also covers a last component that is followed by a
	 * non-NUL byte (presumably trailing slashes - confirm against
	 * link_path_walk()).
	 */
1754 	error = -EISDIR;
1755 	if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
1756 		goto exit;
1757 
1758 	dir = nd->path.dentry;
1759 	nd->flags &= ~LOOKUP_PARENT;
	/*
	 * The parent's i_mutex stays held until one of the mutex_unlock()
	 * calls under do_last, or the one inside open_namei_create().
	 */
1760 	mutex_lock(&dir->d_inode->i_mutex);
1761 	path.dentry = lookup_hash(nd);
1762 	path.mnt = nd->path.mnt;
1763 
	/*
	 * do_last is entered with dir->d_inode->i_mutex held, both from the
	 * code above and from the end of do_link.
	 */
1764 do_last:
1765 	error = PTR_ERR(path.dentry);
1766 	if (IS_ERR(path.dentry)) {
1767 		mutex_unlock(&dir->d_inode->i_mutex);
1768 		goto exit;
1769 	}
1770 
1771 	if (IS_ERR(nd->intent.open.file)) {
1772 		mutex_unlock(&dir->d_inode->i_mutex);
1773 		error = PTR_ERR(nd->intent.open.file);
1774 		goto exit_dput;
1775 	}
1776 
1777 	/* Negative dentry, just create the file */
1778 	if (!path.dentry->d_inode) {
		/* open_namei_create() drops the parent's i_mutex itself. */
1779 		error = open_namei_create(nd, &path, flag, mode);
1780 		if (error)
1781 			goto exit;
1782 		return 0;
1783 	}
1784 
1785 	/*
1786 	 * It already exists.
1787 	 */
1788 	mutex_unlock(&dir->d_inode->i_mutex);
1789 	audit_inode(pathname, path.dentry);
1790 
1791 	error = -EEXIST;
1792 	if (flag & O_EXCL)
1793 		goto exit_dput;
1794 
	/* A nonzero __follow_mount() result combined with O_NOFOLLOW is -ELOOP. */
1795 	if (__follow_mount(&path)) {
1796 		error = -ELOOP;
1797 		if (flag & O_NOFOLLOW)
1798 			goto exit_dput;
1799 	}
1800 
1801 	error = -ENOENT;
1802 	if (!path.dentry->d_inode)
1803 		goto exit_dput;
	/* Anything with a ->follow_link method is handled as a symlink. */
1804 	if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
1805 		goto do_link;
1806 
1807 	path_to_nameidata(&path, nd);
1808 	error = -EISDIR;
1809 	if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
1810 		goto exit;
	/* Common success tail: final permission/type checks on nd->path. */
1811 ok:
1812 	error = may_open(nd, acc_mode, flag);
1813 	if (error)
1814 		goto exit;
1815 	return 0;
1816 
	/* exit_dput additionally releases "path" before the common cleanup. */
1817 exit_dput:
1818 	path_put_conditional(&path, nd);
1819 exit:
1820 	if (!IS_ERR(nd->intent.open.file))
1821 		release_open_intent(nd);
1822 	path_put(&nd->path);
1823 	return error;
1824 
1825 do_link:
1826 	error = -ELOOP;
1827 	if (flag & O_NOFOLLOW)
1828 		goto exit_dput;
1829 	/*
1830 	 * This is subtle. Instead of calling do_follow_link() we do the
1831 	 * thing by hands. The reason is that this way we have zero link_count
1832 	 * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
1833 	 * After that we have the parent and last component, i.e.
1834 	 * we are in the same situation as after the first path_walk().
1835 	 * Well, almost - if the last component is normal we get its copy
1836 	 * stored in nd->last.name and we will have to putname() it when we
1837 	 * are done. Procfs-like symlinks just set LAST_BIND.
1838 	 */
1839 	nd->flags |= LOOKUP_PARENT;
1840 	error = security_inode_follow_link(path.dentry, nd);
1841 	if (error)
1842 		goto exit_dput;
1843 	error = __do_follow_link(&path, nd);
1844 	if (error) {
1845 		/* Does someone understand code flow here? Or it is only
1846 		 * me so stupid? Anathema to whoever designed this non-sense
1847 		 * with "intent.open".
1848 		 */
		/*
		 * NOTE(review): unlike "exit", this path returns without
		 * path_put(&nd->path); presumably __do_follow_link() has
		 * already dropped it on failure - confirm.
		 */
1849 		release_open_intent(nd);
1850 		return error;
1851 	}
1852 	nd->flags &= ~LOOKUP_PARENT;
1853 	if (nd->last_type == LAST_BIND)
1854 		goto ok;
1855 	error = -EISDIR;
1856 	if (nd->last_type != LAST_NORM)
1857 		goto exit;
	/*
	 * From here on the last component is LAST_NORM, so nd->last.name is
	 * freed with __putname() on every way out of this section.
	 */
1858 	if (nd->last.name[nd->last.len]) {
1859 		__putname(nd->last.name);
1860 		goto exit;
1861 	}
1862 	error = -ELOOP;
	/* Fails with -ELOOP once count has already reached 32 on entry. */
1863 	if (count++==32) {
1864 		__putname(nd->last.name);
1865 		goto exit;
1866 	}
1867 	dir = nd->path.dentry;
1868 	mutex_lock(&dir->d_inode->i_mutex);
1869 	path.dentry = lookup_hash(nd);
1870 	path.mnt = nd->path.mnt;
1871 	__putname(nd->last.name);
1872 	goto do_last;
1873 }
1874 
1875 /**
1876  * lookup_create - lookup a dentry, creating it if it doesn't exist
1877  * @nd: nameidata info
1878  * @is_dir: directory flag
1879  *
1880  * Simple function to lookup and return a dentry and create it
1881  * if it doesn't exist.  Is SMP-safe.
1882  *
1883  * Returns with nd->path.dentry->d_inode->i_mutex locked.
1884  */
1885 struct dentry *lookup_create(struct nameidata *nd, int is_dir)
1886 {
1887 	struct dentry *dentry = ERR_PTR(-EEXIST);
1888 
1889 	mutex_lock_nested(&nd->path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
1890 	/*
1891 	 * Yucky last component or no last component at all?
1892 	 * (foo/., foo/.., /////)
1893 	 */
1894 	if (nd->last_type != LAST_NORM)
1895 		goto fail;
1896 	nd->flags &= ~LOOKUP_PARENT;
1897 	nd->flags |= LOOKUP_CREATE;
1898 	nd->intent.open.flags = O_EXCL;
1899 
1900 	/*
1901 	 * Do the final lookup.
1902 	 */
1903 	dentry = lookup_hash(nd);
1904 	if (IS_ERR(dentry))
1905 		goto fail;
1906 
1907 	/*
1908 	 * Special case - lookup gave negative, but... we had foo/bar/
1909 	 * From the vfs_mknod() POV we just have a negative dentry -
1910 	 * all is fine. Let's be bastards - you had / on the end, you've
1911 	 * been asking for (non-existent) directory. -ENOENT for you.
1912 	 */
1913 	if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
1914 		goto enoent;
1915 	return dentry;
1916 enoent:
1917 	dput(dentry);
1918 	dentry = ERR_PTR(-ENOENT);
1919 fail:
1920 	return dentry;
1921 }
1922 EXPORT_SYMBOL_GPL(lookup_create);
1923 
1924 int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1925 {
1926 	int error = may_create(dir, dentry, NULL);
1927 
1928 	if (error)
1929 		return error;
1930 
1931 	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
1932 		return -EPERM;
1933 
1934 	if (!dir->i_op || !dir->i_op->mknod)
1935 		return -EPERM;
1936 
1937 	error = security_inode_mknod(dir, dentry, mode, dev);
1938 	if (error)
1939 		return error;
1940 
1941 	DQUOT_INIT(dir);
1942 	error = dir->i_op->mknod(dir, dentry, mode, dev);
1943 	if (!error)
1944 		fsnotify_create(dir, dentry);
1945 	return error;
1946 }
1947 
1948 asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
1949 				unsigned dev)
1950 {
1951 	int error = 0;
1952 	char * tmp;
1953 	struct dentry * dentry;
1954 	struct nameidata nd;
1955 
1956 	if (S_ISDIR(mode))
1957 		return -EPERM;
1958 	tmp = getname(filename);
1959 	if (IS_ERR(tmp))
1960 		return PTR_ERR(tmp);
1961 
1962 	error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
1963 	if (error)
1964 		goto out;
1965 	dentry = lookup_create(&nd, 0);
1966 	error = PTR_ERR(dentry);
1967 
1968 	if (!IS_POSIXACL(nd.path.dentry->d_inode))
1969 		mode &= ~current->fs->umask;
1970 	if (!IS_ERR(dentry)) {
1971 		switch (mode & S_IFMT) {
1972 		case 0: case S_IFREG:
1973 			error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
1974 			break;
1975 		case S_IFCHR: case S_IFBLK:
1976 			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,
1977 					new_decode_dev(dev));
1978 			break;
1979 		case S_IFIFO: case S_IFSOCK:
1980 			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
1981 			break;
1982 		case S_IFDIR:
1983 			error = -EPERM;
1984 			break;
1985 		default:
1986 			error = -EINVAL;
1987 		}
1988 		dput(dentry);
1989 	}
1990 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
1991 	path_put(&nd.path);
1992 out:
1993 	putname(tmp);
1994 
1995 	return error;
1996 }
1997 
1998 asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev)
1999 {
2000 	return sys_mknodat(AT_FDCWD, filename, mode, dev);
2001 }
2002 
2003 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2004 {
2005 	int error = may_create(dir, dentry, NULL);
2006 
2007 	if (error)
2008 		return error;
2009 
2010 	if (!dir->i_op || !dir->i_op->mkdir)
2011 		return -EPERM;
2012 
2013 	mode &= (S_IRWXUGO|S_ISVTX);
2014 	error = security_inode_mkdir(dir, dentry, mode);
2015 	if (error)
2016 		return error;
2017 
2018 	DQUOT_INIT(dir);
2019 	error = dir->i_op->mkdir(dir, dentry, mode);
2020 	if (!error)
2021 		fsnotify_mkdir(dir, dentry);
2022 	return error;
2023 }
2024 
2025 asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
2026 {
2027 	int error = 0;
2028 	char * tmp;
2029 	struct dentry *dentry;
2030 	struct nameidata nd;
2031 
2032 	tmp = getname(pathname);
2033 	error = PTR_ERR(tmp);
2034 	if (IS_ERR(tmp))
2035 		goto out_err;
2036 
2037 	error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
2038 	if (error)
2039 		goto out;
2040 	dentry = lookup_create(&nd, 1);
2041 	error = PTR_ERR(dentry);
2042 	if (IS_ERR(dentry))
2043 		goto out_unlock;
2044 
2045 	if (!IS_POSIXACL(nd.path.dentry->d_inode))
2046 		mode &= ~current->fs->umask;
2047 	error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
2048 	dput(dentry);
2049 out_unlock:
2050 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2051 	path_put(&nd.path);
2052 out:
2053 	putname(tmp);
2054 out_err:
2055 	return error;
2056 }
2057 
2058 asmlinkage long sys_mkdir(const char __user *pathname, int mode)
2059 {
2060 	return sys_mkdirat(AT_FDCWD, pathname, mode);
2061 }
2062 
2063 /*
2064  * We try to drop the dentry early: we should have
2065  * a usage count of 2 if we're the only user of this
2066  * dentry, and if that is true (possibly after pruning
2067  * the dcache), then we drop the dentry now.
2068  *
2069  * A low-level filesystem can, if it choses, legally
2070  * do a
2071  *
2072  *	if (!d_unhashed(dentry))
2073  *		return -EBUSY;
2074  *
2075  * if it cannot handle the case of removing a directory
2076  * that is still in use by something else..
2077  */
2078 void dentry_unhash(struct dentry *dentry)
2079 {
2080 	dget(dentry);
2081 	shrink_dcache_parent(dentry);
2082 	spin_lock(&dcache_lock);
2083 	spin_lock(&dentry->d_lock);
2084 	if (atomic_read(&dentry->d_count) == 2)
2085 		__d_drop(dentry);
2086 	spin_unlock(&dentry->d_lock);
2087 	spin_unlock(&dcache_lock);
2088 }
2089 
2090 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2091 {
2092 	int error = may_delete(dir, dentry, 1);
2093 
2094 	if (error)
2095 		return error;
2096 
2097 	if (!dir->i_op || !dir->i_op->rmdir)
2098 		return -EPERM;
2099 
2100 	DQUOT_INIT(dir);
2101 
2102 	mutex_lock(&dentry->d_inode->i_mutex);
2103 	dentry_unhash(dentry);
2104 	if (d_mountpoint(dentry))
2105 		error = -EBUSY;
2106 	else {
2107 		error = security_inode_rmdir(dir, dentry);
2108 		if (!error) {
2109 			error = dir->i_op->rmdir(dir, dentry);
2110 			if (!error)
2111 				dentry->d_inode->i_flags |= S_DEAD;
2112 		}
2113 	}
2114 	mutex_unlock(&dentry->d_inode->i_mutex);
2115 	if (!error) {
2116 		d_delete(dentry);
2117 	}
2118 	dput(dentry);
2119 
2120 	return error;
2121 }
2122 
2123 static long do_rmdir(int dfd, const char __user *pathname)
2124 {
2125 	int error = 0;
2126 	char * name;
2127 	struct dentry *dentry;
2128 	struct nameidata nd;
2129 
2130 	name = getname(pathname);
2131 	if(IS_ERR(name))
2132 		return PTR_ERR(name);
2133 
2134 	error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
2135 	if (error)
2136 		goto exit;
2137 
2138 	switch(nd.last_type) {
2139 		case LAST_DOTDOT:
2140 			error = -ENOTEMPTY;
2141 			goto exit1;
2142 		case LAST_DOT:
2143 			error = -EINVAL;
2144 			goto exit1;
2145 		case LAST_ROOT:
2146 			error = -EBUSY;
2147 			goto exit1;
2148 	}
2149 	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2150 	dentry = lookup_hash(&nd);
2151 	error = PTR_ERR(dentry);
2152 	if (IS_ERR(dentry))
2153 		goto exit2;
2154 	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
2155 	dput(dentry);
2156 exit2:
2157 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2158 exit1:
2159 	path_put(&nd.path);
2160 exit:
2161 	putname(name);
2162 	return error;
2163 }
2164 
2165 asmlinkage long sys_rmdir(const char __user *pathname)
2166 {
2167 	return do_rmdir(AT_FDCWD, pathname);
2168 }
2169 
2170 int vfs_unlink(struct inode *dir, struct dentry *dentry)
2171 {
2172 	int error = may_delete(dir, dentry, 0);
2173 
2174 	if (error)
2175 		return error;
2176 
2177 	if (!dir->i_op || !dir->i_op->unlink)
2178 		return -EPERM;
2179 
2180 	DQUOT_INIT(dir);
2181 
2182 	mutex_lock(&dentry->d_inode->i_mutex);
2183 	if (d_mountpoint(dentry))
2184 		error = -EBUSY;
2185 	else {
2186 		error = security_inode_unlink(dir, dentry);
2187 		if (!error)
2188 			error = dir->i_op->unlink(dir, dentry);
2189 	}
2190 	mutex_unlock(&dentry->d_inode->i_mutex);
2191 
2192 	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
2193 	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
2194 		fsnotify_link_count(dentry->d_inode);
2195 		d_delete(dentry);
2196 	}
2197 
2198 	return error;
2199 }
2200 
2201 /*
2202  * Make sure that the actual truncation of the file will occur outside its
2203  * directory's i_mutex.  Truncate can take a long time if there is a lot of
2204  * writeout happening, and we don't want to prevent access to the directory
2205  * while waiting on the I/O.
2206  */
2207 static long do_unlinkat(int dfd, const char __user *pathname)
2208 {
2209 	int error = 0;
2210 	char * name;
2211 	struct dentry *dentry;
2212 	struct nameidata nd;
2213 	struct inode *inode = NULL;
2214 
2215 	name = getname(pathname);
2216 	if(IS_ERR(name))
2217 		return PTR_ERR(name);
2218 
2219 	error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
2220 	if (error)
2221 		goto exit;
2222 	error = -EISDIR;
2223 	if (nd.last_type != LAST_NORM)
2224 		goto exit1;
2225 	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2226 	dentry = lookup_hash(&nd);
2227 	error = PTR_ERR(dentry);
2228 	if (!IS_ERR(dentry)) {
2229 		/* Why not before? Because we want correct error value */
2230 		if (nd.last.name[nd.last.len])
2231 			goto slashes;
2232 		inode = dentry->d_inode;
2233 		if (inode)
2234 			atomic_inc(&inode->i_count);
2235 		error = vfs_unlink(nd.path.dentry->d_inode, dentry);
2236 	exit2:
2237 		dput(dentry);
2238 	}
2239 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2240 	if (inode)
2241 		iput(inode);	/* truncate the inode here */
2242 exit1:
2243 	path_put(&nd.path);
2244 exit:
2245 	putname(name);
2246 	return error;
2247 
2248 slashes:
2249 	error = !dentry->d_inode ? -ENOENT :
2250 		S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
2251 	goto exit2;
2252 }
2253 
2254 asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag)
2255 {
2256 	if ((flag & ~AT_REMOVEDIR) != 0)
2257 		return -EINVAL;
2258 
2259 	if (flag & AT_REMOVEDIR)
2260 		return do_rmdir(dfd, pathname);
2261 
2262 	return do_unlinkat(dfd, pathname);
2263 }
2264 
/*
 * sys_unlink - unlink a name; @pathname is resolved with AT_FDCWD as the
 * starting directory.  All work is done by do_unlinkat().
 */
asmlinkage long sys_unlink(const char __user *pathname)
{
	return do_unlinkat(AT_FDCWD, pathname);
}
2269 
2270 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode)
2271 {
2272 	int error = may_create(dir, dentry, NULL);
2273 
2274 	if (error)
2275 		return error;
2276 
2277 	if (!dir->i_op || !dir->i_op->symlink)
2278 		return -EPERM;
2279 
2280 	error = security_inode_symlink(dir, dentry, oldname);
2281 	if (error)
2282 		return error;
2283 
2284 	DQUOT_INIT(dir);
2285 	error = dir->i_op->symlink(dir, dentry, oldname);
2286 	if (!error)
2287 		fsnotify_create(dir, dentry);
2288 	return error;
2289 }
2290 
2291 asmlinkage long sys_symlinkat(const char __user *oldname,
2292 			      int newdfd, const char __user *newname)
2293 {
2294 	int error = 0;
2295 	char * from;
2296 	char * to;
2297 	struct dentry *dentry;
2298 	struct nameidata nd;
2299 
2300 	from = getname(oldname);
2301 	if(IS_ERR(from))
2302 		return PTR_ERR(from);
2303 	to = getname(newname);
2304 	error = PTR_ERR(to);
2305 	if (IS_ERR(to))
2306 		goto out_putname;
2307 
2308 	error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
2309 	if (error)
2310 		goto out;
2311 	dentry = lookup_create(&nd, 0);
2312 	error = PTR_ERR(dentry);
2313 	if (IS_ERR(dentry))
2314 		goto out_unlock;
2315 
2316 	error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO);
2317 	dput(dentry);
2318 out_unlock:
2319 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2320 	path_put(&nd.path);
2321 out:
2322 	putname(to);
2323 out_putname:
2324 	putname(from);
2325 	return error;
2326 }
2327 
/*
 * sys_symlink - create a symlink; @newname is resolved with AT_FDCWD as
 * the starting directory.  All work is done by sys_symlinkat().
 */
asmlinkage long sys_symlink(const char __user *oldname, const char __user *newname)
{
	return sys_symlinkat(oldname, AT_FDCWD, newname);
}
2332 
2333 int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
2334 {
2335 	struct inode *inode = old_dentry->d_inode;
2336 	int error;
2337 
2338 	if (!inode)
2339 		return -ENOENT;
2340 
2341 	error = may_create(dir, new_dentry, NULL);
2342 	if (error)
2343 		return error;
2344 
2345 	if (dir->i_sb != inode->i_sb)
2346 		return -EXDEV;
2347 
2348 	/*
2349 	 * A link to an append-only or immutable file cannot be created.
2350 	 */
2351 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2352 		return -EPERM;
2353 	if (!dir->i_op || !dir->i_op->link)
2354 		return -EPERM;
2355 	if (S_ISDIR(old_dentry->d_inode->i_mode))
2356 		return -EPERM;
2357 
2358 	error = security_inode_link(old_dentry, dir, new_dentry);
2359 	if (error)
2360 		return error;
2361 
2362 	mutex_lock(&old_dentry->d_inode->i_mutex);
2363 	DQUOT_INIT(dir);
2364 	error = dir->i_op->link(old_dentry, dir, new_dentry);
2365 	mutex_unlock(&old_dentry->d_inode->i_mutex);
2366 	if (!error)
2367 		fsnotify_link(dir, old_dentry->d_inode, new_dentry);
2368 	return error;
2369 }
2370 
2371 /*
2372  * Hardlinks are often used in delicate situations.  We avoid
2373  * security-related surprises by not following symlinks on the
2374  * newname.  --KAB
2375  *
2376  * We don't follow them on the oldname either to be compatible
2377  * with linux 2.0, and to avoid hard-linking to directories
2378  * and other special files.  --ADM
2379  */
2380 asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
2381 			   int newdfd, const char __user *newname,
2382 			   int flags)
2383 {
2384 	struct dentry *new_dentry;
2385 	struct nameidata nd, old_nd;
2386 	int error;
2387 	char * to;
2388 
2389 	if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
2390 		return -EINVAL;
2391 
2392 	to = getname(newname);
2393 	if (IS_ERR(to))
2394 		return PTR_ERR(to);
2395 
2396 	error = __user_walk_fd(olddfd, oldname,
2397 			       flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
2398 			       &old_nd);
2399 	if (error)
2400 		goto exit;
2401 	error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
2402 	if (error)
2403 		goto out;
2404 	error = -EXDEV;
2405 	if (old_nd.path.mnt != nd.path.mnt)
2406 		goto out_release;
2407 	new_dentry = lookup_create(&nd, 0);
2408 	error = PTR_ERR(new_dentry);
2409 	if (IS_ERR(new_dentry))
2410 		goto out_unlock;
2411 	error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry);
2412 	dput(new_dentry);
2413 out_unlock:
2414 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2415 out_release:
2416 	path_put(&nd.path);
2417 out:
2418 	path_put(&old_nd.path);
2419 exit:
2420 	putname(to);
2421 
2422 	return error;
2423 }
2424 
/*
 * sys_link - create a hard link; both names are resolved with AT_FDCWD as
 * the starting directory and no flags.  All work is done by sys_linkat().
 */
asmlinkage long sys_link(const char __user *oldname, const char __user *newname)
{
	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}
2429 
2430 /*
2431  * The worst of all namespace operations - renaming directory. "Perverted"
2432  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
2433  * Problems:
2434  *	a) we can get into loop creation. Check is done in is_subdir().
2435  *	b) race potential - two innocent renames can create a loop together.
2436  *	   That's where 4.4 screws up. Current fix: serialization on
2437  *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
2438  *	   story.
2439  *	c) we have to lock _three_ objects - parents and victim (if it exists).
2440  *	   And that - after we got ->i_mutex on parents (until then we don't know
2441  *	   whether the target exists).  Solution: try to be smart with locking
2442  *	   order for inodes.  We rely on the fact that tree topology may change
2443  *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
2444  *	   move will be locked.  Thus we can rank directories by the tree
2445  *	   (ancestors first) and rank all non-directories after them.
2446  *	   That works since everybody except rename does "lock parent, lookup,
2447  *	   lock child" and rename is under ->s_vfs_rename_mutex.
2448  *	   HOWEVER, it relies on the assumption that any object with ->lookup()
2449  *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
2450  *	   we'd better make sure that there's no link(2) for them.
2451  *	d) some filesystems don't support opened-but-unlinked directories,
2452  *	   either because of layout or because they are not ready to deal with
2453  *	   all cases correctly. The latter will be fixed (taking this sort of
2454  *	   stuff into VFS), but the former is not going away. Solution: the same
2455  *	   trick as in rmdir().
2456  *	e) conversion from fhandle to dentry may come in the wrong moment - when
2457  *	   we are removing the target. Solution: we will have to grab ->i_mutex
2458  *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
 *	   ->i_mutex on parents, which works but leads to some truly excessive
2460  *	   locking].
2461  */
2462 static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2463 			  struct inode *new_dir, struct dentry *new_dentry)
2464 {
2465 	int error = 0;
2466 	struct inode *target;
2467 
2468 	/*
2469 	 * If we are going to change the parent - check write permissions,
2470 	 * we'll need to flip '..'.
2471 	 */
2472 	if (new_dir != old_dir) {
2473 		error = permission(old_dentry->d_inode, MAY_WRITE, NULL);
2474 		if (error)
2475 			return error;
2476 	}
2477 
2478 	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
2479 	if (error)
2480 		return error;
2481 
2482 	target = new_dentry->d_inode;
2483 	if (target) {
2484 		mutex_lock(&target->i_mutex);
2485 		dentry_unhash(new_dentry);
2486 	}
2487 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2488 		error = -EBUSY;
2489 	else
2490 		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2491 	if (target) {
2492 		if (!error)
2493 			target->i_flags |= S_DEAD;
2494 		mutex_unlock(&target->i_mutex);
2495 		if (d_unhashed(new_dentry))
2496 			d_rehash(new_dentry);
2497 		dput(new_dentry);
2498 	}
2499 	if (!error)
2500 		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2501 			d_move(old_dentry,new_dentry);
2502 	return error;
2503 }
2504 
2505 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2506 			    struct inode *new_dir, struct dentry *new_dentry)
2507 {
2508 	struct inode *target;
2509 	int error;
2510 
2511 	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
2512 	if (error)
2513 		return error;
2514 
2515 	dget(new_dentry);
2516 	target = new_dentry->d_inode;
2517 	if (target)
2518 		mutex_lock(&target->i_mutex);
2519 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2520 		error = -EBUSY;
2521 	else
2522 		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2523 	if (!error) {
2524 		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2525 			d_move(old_dentry, new_dentry);
2526 	}
2527 	if (target)
2528 		mutex_unlock(&target->i_mutex);
2529 	dput(new_dentry);
2530 	return error;
2531 }
2532 
2533 int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2534 	       struct inode *new_dir, struct dentry *new_dentry)
2535 {
2536 	int error;
2537 	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
2538 	const char *old_name;
2539 
2540 	if (old_dentry->d_inode == new_dentry->d_inode)
2541  		return 0;
2542 
2543 	error = may_delete(old_dir, old_dentry, is_dir);
2544 	if (error)
2545 		return error;
2546 
2547 	if (!new_dentry->d_inode)
2548 		error = may_create(new_dir, new_dentry, NULL);
2549 	else
2550 		error = may_delete(new_dir, new_dentry, is_dir);
2551 	if (error)
2552 		return error;
2553 
2554 	if (!old_dir->i_op || !old_dir->i_op->rename)
2555 		return -EPERM;
2556 
2557 	DQUOT_INIT(old_dir);
2558 	DQUOT_INIT(new_dir);
2559 
2560 	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
2561 
2562 	if (is_dir)
2563 		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
2564 	else
2565 		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
2566 	if (!error) {
2567 		const char *new_name = old_dentry->d_name.name;
2568 		fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir,
2569 			      new_dentry->d_inode, old_dentry);
2570 	}
2571 	fsnotify_oldname_free(old_name);
2572 
2573 	return error;
2574 }
2575 
2576 static int do_rename(int olddfd, const char *oldname,
2577 			int newdfd, const char *newname)
2578 {
2579 	int error = 0;
2580 	struct dentry * old_dir, * new_dir;
2581 	struct dentry * old_dentry, *new_dentry;
2582 	struct dentry * trap;
2583 	struct nameidata oldnd, newnd;
2584 
2585 	error = do_path_lookup(olddfd, oldname, LOOKUP_PARENT, &oldnd);
2586 	if (error)
2587 		goto exit;
2588 
2589 	error = do_path_lookup(newdfd, newname, LOOKUP_PARENT, &newnd);
2590 	if (error)
2591 		goto exit1;
2592 
2593 	error = -EXDEV;
2594 	if (oldnd.path.mnt != newnd.path.mnt)
2595 		goto exit2;
2596 
2597 	old_dir = oldnd.path.dentry;
2598 	error = -EBUSY;
2599 	if (oldnd.last_type != LAST_NORM)
2600 		goto exit2;
2601 
2602 	new_dir = newnd.path.dentry;
2603 	if (newnd.last_type != LAST_NORM)
2604 		goto exit2;
2605 
2606 	trap = lock_rename(new_dir, old_dir);
2607 
2608 	old_dentry = lookup_hash(&oldnd);
2609 	error = PTR_ERR(old_dentry);
2610 	if (IS_ERR(old_dentry))
2611 		goto exit3;
2612 	/* source must exist */
2613 	error = -ENOENT;
2614 	if (!old_dentry->d_inode)
2615 		goto exit4;
2616 	/* unless the source is a directory trailing slashes give -ENOTDIR */
2617 	if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
2618 		error = -ENOTDIR;
2619 		if (oldnd.last.name[oldnd.last.len])
2620 			goto exit4;
2621 		if (newnd.last.name[newnd.last.len])
2622 			goto exit4;
2623 	}
2624 	/* source should not be ancestor of target */
2625 	error = -EINVAL;
2626 	if (old_dentry == trap)
2627 		goto exit4;
2628 	new_dentry = lookup_hash(&newnd);
2629 	error = PTR_ERR(new_dentry);
2630 	if (IS_ERR(new_dentry))
2631 		goto exit4;
2632 	/* target should not be an ancestor of source */
2633 	error = -ENOTEMPTY;
2634 	if (new_dentry == trap)
2635 		goto exit5;
2636 
2637 	error = vfs_rename(old_dir->d_inode, old_dentry,
2638 				   new_dir->d_inode, new_dentry);
2639 exit5:
2640 	dput(new_dentry);
2641 exit4:
2642 	dput(old_dentry);
2643 exit3:
2644 	unlock_rename(new_dir, old_dir);
2645 exit2:
2646 	path_put(&newnd.path);
2647 exit1:
2648 	path_put(&oldnd.path);
2649 exit:
2650 	return error;
2651 }
2652 
2653 asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
2654 			     int newdfd, const char __user *newname)
2655 {
2656 	int error;
2657 	char * from;
2658 	char * to;
2659 
2660 	from = getname(oldname);
2661 	if(IS_ERR(from))
2662 		return PTR_ERR(from);
2663 	to = getname(newname);
2664 	error = PTR_ERR(to);
2665 	if (!IS_ERR(to)) {
2666 		error = do_rename(olddfd, from, newdfd, to);
2667 		putname(to);
2668 	}
2669 	putname(from);
2670 	return error;
2671 }
2672 
/*
 * sys_rename - rename; both names are resolved with AT_FDCWD as the
 * starting directory.  All work is done by sys_renameat().
 */
asmlinkage long sys_rename(const char __user *oldname, const char __user *newname)
{
	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
}
2677 
2678 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
2679 {
2680 	int len;
2681 
2682 	len = PTR_ERR(link);
2683 	if (IS_ERR(link))
2684 		goto out;
2685 
2686 	len = strlen(link);
2687 	if (len > (unsigned) buflen)
2688 		len = buflen;
2689 	if (copy_to_user(buffer, link, len))
2690 		len = -EFAULT;
2691 out:
2692 	return len;
2693 }
2694 
2695 /*
2696  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
2697  * have ->follow_link() touching nd only in nd_set_link().  Using (or not
2698  * using) it for any given inode is up to filesystem.
2699  */
2700 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
2701 {
2702 	struct nameidata nd;
2703 	void *cookie;
2704 
2705 	nd.depth = 0;
2706 	cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
2707 	if (!IS_ERR(cookie)) {
2708 		int res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
2709 		if (dentry->d_inode->i_op->put_link)
2710 			dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
2711 		cookie = ERR_PTR(res);
2712 	}
2713 	return PTR_ERR(cookie);
2714 }
2715 
/*
 * vfs_follow_link - non-static wrapper that forwards to
 * __vfs_follow_link() and returns its result unchanged.
 */
int vfs_follow_link(struct nameidata *nd, const char *link)
{
	return __vfs_follow_link(nd, link);
}
2720 
2721 /* get the link contents into pagecache */
2722 static char *page_getlink(struct dentry * dentry, struct page **ppage)
2723 {
2724 	struct page * page;
2725 	struct address_space *mapping = dentry->d_inode->i_mapping;
2726 	page = read_mapping_page(mapping, 0, NULL);
2727 	if (IS_ERR(page))
2728 		return (char*)page;
2729 	*ppage = page;
2730 	return kmap(page);
2731 }
2732 
2733 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
2734 {
2735 	struct page *page = NULL;
2736 	char *s = page_getlink(dentry, &page);
2737 	int res = vfs_readlink(dentry,buffer,buflen,s);
2738 	if (page) {
2739 		kunmap(page);
2740 		page_cache_release(page);
2741 	}
2742 	return res;
2743 }
2744 
2745 void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
2746 {
2747 	struct page *page = NULL;
2748 	nd_set_link(nd, page_getlink(dentry, &page));
2749 	return page;
2750 }
2751 
/*
 * page_put_link - ->put_link() counterpart of page_follow_link_light().
 *
 * @cookie is treated as the struct page pointer; a non-NULL page is
 * unmapped and released.  @dentry and @nd are not used.
 */
void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
	struct page *pg = cookie;

	if (!pg)
		return;

	kunmap(pg);
	page_cache_release(pg);
}
2761 
2762 int __page_symlink(struct inode *inode, const char *symname, int len,
2763 		gfp_t gfp_mask)
2764 {
2765 	struct address_space *mapping = inode->i_mapping;
2766 	struct page *page;
2767 	void *fsdata;
2768 	int err;
2769 	char *kaddr;
2770 
2771 retry:
2772 	err = pagecache_write_begin(NULL, mapping, 0, len-1,
2773 				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
2774 	if (err)
2775 		goto fail;
2776 
2777 	kaddr = kmap_atomic(page, KM_USER0);
2778 	memcpy(kaddr, symname, len-1);
2779 	kunmap_atomic(kaddr, KM_USER0);
2780 
2781 	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
2782 							page, fsdata);
2783 	if (err < 0)
2784 		goto fail;
2785 	if (err < len-1)
2786 		goto retry;
2787 
2788 	mark_inode_dirty(inode);
2789 	return 0;
2790 fail:
2791 	return err;
2792 }
2793 
/*
 * page_symlink - __page_symlink() with the gfp mask taken from the
 * inode's own mapping via mapping_gfp_mask().
 */
int page_symlink(struct inode *inode, const char *symname, int len)
{
	return __page_symlink(inode, symname, len,
			mapping_gfp_mask(inode->i_mapping));
}
2799 
/*
 * Inode operations table that wires ->readlink to generic_readlink() and
 * ->follow_link / ->put_link to the page_follow_link_light() /
 * page_put_link() pair defined in this file.
 */
const struct inode_operations page_symlink_inode_operations = {
	.readlink	= generic_readlink,
	.follow_link	= page_follow_link_light,
	.put_link	= page_put_link,
};
2805 
/* Symbols from this file made available through EXPORT_SYMBOL(). */
EXPORT_SYMBOL(__user_walk);
EXPORT_SYMBOL(__user_walk_fd);
EXPORT_SYMBOL(follow_down);
EXPORT_SYMBOL(follow_up);
EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
EXPORT_SYMBOL(getname);
EXPORT_SYMBOL(lock_rename);
EXPORT_SYMBOL(lookup_one_len);
EXPORT_SYMBOL(page_follow_link_light);
EXPORT_SYMBOL(page_put_link);
EXPORT_SYMBOL(page_readlink);
EXPORT_SYMBOL(__page_symlink);
EXPORT_SYMBOL(page_symlink);
EXPORT_SYMBOL(page_symlink_inode_operations);
EXPORT_SYMBOL(path_lookup);
EXPORT_SYMBOL(vfs_path_lookup);
EXPORT_SYMBOL(permission);
EXPORT_SYMBOL(vfs_permission);
EXPORT_SYMBOL(file_permission);
EXPORT_SYMBOL(unlock_rename);
EXPORT_SYMBOL(vfs_create);
EXPORT_SYMBOL(vfs_follow_link);
EXPORT_SYMBOL(vfs_link);
EXPORT_SYMBOL(vfs_mkdir);
EXPORT_SYMBOL(vfs_mknod);
EXPORT_SYMBOL(generic_permission);
EXPORT_SYMBOL(vfs_readlink);
EXPORT_SYMBOL(vfs_rename);
EXPORT_SYMBOL(vfs_rmdir);
EXPORT_SYMBOL(vfs_symlink);
EXPORT_SYMBOL(vfs_unlink);
EXPORT_SYMBOL(dentry_unhash);
EXPORT_SYMBOL(generic_readlink);
2839