xref: /openbmc/linux/fs/namei.c (revision 554a8b9f54cd7ca2b89f5dc227df08be082fae0d)
1 /*
2  *  linux/fs/namei.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6 
7 /*
8  * Some corrections by tytso.
9  */
10 
11 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
12  * lookup logic.
13  */
14 /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
15  */
16 
17 #include <linux/init.h>
18 #include <linux/module.h>
19 #include <linux/slab.h>
20 #include <linux/fs.h>
21 #include <linux/namei.h>
22 #include <linux/pagemap.h>
23 #include <linux/fsnotify.h>
24 #include <linux/personality.h>
25 #include <linux/security.h>
26 #include <linux/ima.h>
27 #include <linux/syscalls.h>
28 #include <linux/mount.h>
29 #include <linux/audit.h>
30 #include <linux/capability.h>
31 #include <linux/file.h>
32 #include <linux/fcntl.h>
33 #include <linux/device_cgroup.h>
34 #include <linux/fs_struct.h>
35 #include <asm/uaccess.h>
36 
37 #include "internal.h"
38 
39 /* [Feb-1997 T. Schoebel-Theuer]
40  * Fundamental changes in the pathname lookup mechanisms (namei)
41  * were necessary because of omirr.  The reason is that omirr needs
42  * to know the _real_ pathname, not the user-supplied one, in case
43  * of symlinks (and also when transname replacements occur).
44  *
45  * The new code replaces the old recursive symlink resolution with
46  * an iterative one (in case of non-nested symlink chains).  It does
47  * this with calls to <fs>_follow_link().
48  * As a side effect, dir_namei(), _namei() and follow_link() are now
49  * replaced with a single function lookup_dentry() that can handle all
50  * the special cases of the former code.
51  *
52  * With the new dcache, the pathname is stored at each inode, at least as
53  * long as the refcount of the inode is positive.  As a side effect, the
54  * size of the dcache depends on the inode cache and thus is dynamic.
55  *
56  * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
57  * resolution to correspond with current state of the code.
58  *
59  * Note that the symlink resolution is not *completely* iterative.
60  * There is still a significant amount of tail- and mid- recursion in
61  * the algorithm.  Also, note that <fs>_readlink() is not used in
62  * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
63  * may return different results than <fs>_follow_link().  Many virtual
64  * filesystems (including /proc) exhibit this behavior.
65  */
66 
67 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
68  * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
69  * and the name already exists in form of a symlink, try to create the new
70  * name indicated by the symlink. The old code always complained that the
71  * name already exists, due to not following the symlink even if its target
72  * is nonexistent.  The new semantics affects also mknod() and link() when
73  * the name is a symlink pointing to a non-existent name.
74  *
75  * I don't know which semantics is the right one, since I have no access
76  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
77  * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
78  * "old" one. Personally, I think the new semantics is much more logical.
79  * Note that "ln old new" where "new" is a symlink pointing to a non-existing
80  * file does succeed in both HP-UX and SunOS, but not in Solaris
81  * and in the old Linux semantics.
82  */
83 
84 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
85  * semantics.  See the comments in "open_namei" and "do_link" below.
86  *
87  * [10-Sep-98 Alan Modra] Another symlink change.
88  */
89 
90 /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
91  *	inside the path - always follow.
92  *	in the last component in creation/removal/renaming - never follow.
93  *	if LOOKUP_FOLLOW passed - follow.
94  *	if the pathname has trailing slashes - follow.
95  *	otherwise - don't follow.
96  * (applied in that order).
97  *
98  * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
99  * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
100  * During the 2.4 we need to fix the userland stuff depending on it -
101  * hopefully we will be able to get rid of that wart in 2.5. So far only
102  * XEmacs seems to be relying on it...
103  */
104 /*
105  * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
106  * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
107  * any extra contention...
108  */
109 
110 /* In order to reduce some races, while at the same time doing additional
111  * checking and hopefully speeding things up, we copy filenames to the
112  * kernel data space before using them..
113  *
114  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
115  * PATH_MAX includes the nul terminator --RR.
116  */
117 static int do_getname(const char __user *filename, char *page)
118 {
119 	int retval;
120 	unsigned long len = PATH_MAX;
121 
122 	if (!segment_eq(get_fs(), KERNEL_DS)) {
123 		if ((unsigned long) filename >= TASK_SIZE)
124 			return -EFAULT;
125 		if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
126 			len = TASK_SIZE - (unsigned long) filename;
127 	}
128 
129 	retval = strncpy_from_user(page, filename, len);
130 	if (retval > 0) {
131 		if (retval < len)
132 			return 0;
133 		return -ENAMETOOLONG;
134 	} else if (!retval)
135 		retval = -ENOENT;
136 	return retval;
137 }
138 
139 static char *getname_flags(const char __user * filename, int flags)
140 {
141 	char *tmp, *result;
142 
143 	result = ERR_PTR(-ENOMEM);
144 	tmp = __getname();
145 	if (tmp)  {
146 		int retval = do_getname(filename, tmp);
147 
148 		result = tmp;
149 		if (retval < 0) {
150 			if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
151 				__putname(tmp);
152 				result = ERR_PTR(retval);
153 			}
154 		}
155 	}
156 	audit_getname(result);
157 	return result;
158 }
159 
160 char *getname(const char __user * filename)
161 {
162 	return getname_flags(filename, 0);
163 }
164 
#ifdef CONFIG_AUDITSYSCALL
/*
 * Release a name buffer.  With a dummy audit context the buffer goes
 * straight to __putname(); otherwise it is handed to audit_putname().
 */
void putname(const char *name)
{
	if (likely(audit_dummy_context())) {
		__putname(name);
		return;
	}
	audit_putname(name);
}
EXPORT_SYMBOL(putname);
#endif
175 
176 /*
177  * This does basic POSIX ACL permission checking
178  */
179 static int acl_permission_check(struct inode *inode, int mask)
180 {
181 	int (*check_acl)(struct inode *inode, int mask);
182 	unsigned int mode = inode->i_mode;
183 
184 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK;
185 
186 	if (current_user_ns() != inode_userns(inode))
187 		goto other_perms;
188 
189 	if (current_fsuid() == inode->i_uid)
190 		mode >>= 6;
191 	else {
192 		check_acl = inode->i_op->check_acl;
193 		if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
194 			int error = check_acl(inode, mask);
195 			if (error != -EAGAIN)
196 				return error;
197 		}
198 
199 		if (in_group_p(inode->i_gid))
200 			mode >>= 3;
201 	}
202 
203 other_perms:
204 	/*
205 	 * If the DACs are ok we don't need any capability check.
206 	 */
207 	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
208 		return 0;
209 	return -EACCES;
210 }
211 
212 /**
213  * generic_permission -  check for access rights on a Posix-like filesystem
214  * @inode:	inode to check access rights for
215  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
216  * @flags:	IPERM_FLAG_ flags.
217  *
218  * Used to check for read/write/execute permissions on a file.
219  * We use "fsuid" for this, letting us set arbitrary permissions
220  * for filesystem access without changing the "normal" uids which
221  * are used for other things.
222  *
223  * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
224  * request cannot be satisfied (eg. requires blocking or too much complexity).
225  * It would then be called again in ref-walk mode.
226  */
227 int generic_permission(struct inode *inode, int mask)
228 {
229 	int ret;
230 
231 	/*
232 	 * Do the basic POSIX ACL permission checks.
233 	 */
234 	ret = acl_permission_check(inode, mask);
235 	if (ret != -EACCES)
236 		return ret;
237 
238 	if (S_ISDIR(inode->i_mode)) {
239 		/* DACs are overridable for directories */
240 		if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
241 			return 0;
242 		if (!(mask & MAY_WRITE))
243 			if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
244 				return 0;
245 		return -EACCES;
246 	}
247 	/*
248 	 * Read/write DACs are always overridable.
249 	 * Executable DACs are overridable when there is
250 	 * at least one exec bit set.
251 	 */
252 	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
253 		if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
254 			return 0;
255 
256 	/*
257 	 * Searching includes executable on directories, else just read.
258 	 */
259 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
260 	if (mask == MAY_READ)
261 		if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
262 			return 0;
263 
264 	return -EACCES;
265 }
266 
267 /**
268  * inode_permission  -  check for access rights to a given inode
269  * @inode:	inode to check permission on
270  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
271  *
272  * Used to check for read/write/execute permissions on an inode.
273  * We use "fsuid" for this, letting us set arbitrary permissions
274  * for filesystem access without changing the "normal" uids which
275  * are used for other things.
276  */
277 int inode_permission(struct inode *inode, int mask)
278 {
279 	int retval;
280 
281 	if (mask & MAY_WRITE) {
282 		umode_t mode = inode->i_mode;
283 
284 		/*
285 		 * Nobody gets write access to a read-only fs.
286 		 */
287 		if (IS_RDONLY(inode) &&
288 		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
289 			return -EROFS;
290 
291 		/*
292 		 * Nobody gets write access to an immutable file.
293 		 */
294 		if (IS_IMMUTABLE(inode))
295 			return -EACCES;
296 	}
297 
298 	if (inode->i_op->permission)
299 		retval = inode->i_op->permission(inode, mask);
300 	else
301 		retval = generic_permission(inode, mask);
302 
303 	if (retval)
304 		return retval;
305 
306 	retval = devcgroup_inode_permission(inode, mask);
307 	if (retval)
308 		return retval;
309 
310 	return security_inode_permission(inode, mask);
311 }
312 
313 /**
314  * path_get - get a reference to a path
315  * @path: path to get the reference to
316  *
317  * Given a path increment the reference count to the dentry and the vfsmount.
318  */
319 void path_get(struct path *path)
320 {
321 	mntget(path->mnt);
322 	dget(path->dentry);
323 }
324 EXPORT_SYMBOL(path_get);
325 
326 /**
327  * path_put - put a reference to a path
328  * @path: path to put the reference to
329  *
330  * Given a path decrement the reference count to the dentry and the vfsmount.
331  */
332 void path_put(struct path *path)
333 {
334 	dput(path->dentry);
335 	mntput(path->mnt);
336 }
337 EXPORT_SYMBOL(path_put);
338 
339 /*
340  * Path walking has 2 modes, rcu-walk and ref-walk (see
341  * Documentation/filesystems/path-lookup.txt).  In situations when we can't
342  * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
343  * normal reference counts on dentries and vfsmounts to transition to ref-walk
344  * mode.  Refcounts are grabbed at the last known good point before rcu-walk
345  * got stuck, so ref-walk may continue from there. If this is not successful
346  * (eg. a seqcount has changed), then failure is returned and it's up to caller
347  * to restart the path walk from the beginning in ref-walk mode.
348  */
349 
350 /**
351  * unlazy_walk - try to switch to ref-walk mode.
352  * @nd: nameidata pathwalk data
353  * @dentry: child of nd->path.dentry or NULL
354  * Returns: 0 on success, -ECHILD on failure
355  *
356  * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
357  * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
358  * @nd or NULL.  Must be called from rcu-walk context.
359  */
360 static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
361 {
362 	struct fs_struct *fs = current->fs;
363 	struct dentry *parent = nd->path.dentry;
364 	int want_root = 0;
365 
366 	BUG_ON(!(nd->flags & LOOKUP_RCU));
367 	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
368 		want_root = 1;
369 		spin_lock(&fs->lock);
370 		if (nd->root.mnt != fs->root.mnt ||
371 				nd->root.dentry != fs->root.dentry)
372 			goto err_root;
373 	}
374 	spin_lock(&parent->d_lock);
375 	if (!dentry) {
376 		if (!__d_rcu_to_refcount(parent, nd->seq))
377 			goto err_parent;
378 		BUG_ON(nd->inode != parent->d_inode);
379 	} else {
380 		if (dentry->d_parent != parent)
381 			goto err_parent;
382 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
383 		if (!__d_rcu_to_refcount(dentry, nd->seq))
384 			goto err_child;
385 		/*
386 		 * If the sequence check on the child dentry passed, then
387 		 * the child has not been removed from its parent. This
388 		 * means the parent dentry must be valid and able to take
389 		 * a reference at this point.
390 		 */
391 		BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
392 		BUG_ON(!parent->d_count);
393 		parent->d_count++;
394 		spin_unlock(&dentry->d_lock);
395 	}
396 	spin_unlock(&parent->d_lock);
397 	if (want_root) {
398 		path_get(&nd->root);
399 		spin_unlock(&fs->lock);
400 	}
401 	mntget(nd->path.mnt);
402 
403 	rcu_read_unlock();
404 	br_read_unlock(vfsmount_lock);
405 	nd->flags &= ~LOOKUP_RCU;
406 	return 0;
407 
408 err_child:
409 	spin_unlock(&dentry->d_lock);
410 err_parent:
411 	spin_unlock(&parent->d_lock);
412 err_root:
413 	if (want_root)
414 		spin_unlock(&fs->lock);
415 	return -ECHILD;
416 }
417 
418 /**
419  * release_open_intent - free up open intent resources
420  * @nd: pointer to nameidata
421  */
422 void release_open_intent(struct nameidata *nd)
423 {
424 	struct file *file = nd->intent.open.file;
425 
426 	if (file && !IS_ERR(file)) {
427 		if (file->f_path.dentry == NULL)
428 			put_filp(file);
429 		else
430 			fput(file);
431 	}
432 }
433 
434 static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd)
435 {
436 	return dentry->d_op->d_revalidate(dentry, nd);
437 }
438 
439 /**
440  * complete_walk - successful completion of path walk
441  * @nd:  pointer nameidata
442  *
443  * If we had been in RCU mode, drop out of it and legitimize nd->path.
444  * Revalidate the final result, unless we'd already done that during
445  * the path walk or the filesystem doesn't ask for it.  Return 0 on
446  * success, -error on failure.  In case of failure caller does not
447  * need to drop nd->path.
448  */
449 static int complete_walk(struct nameidata *nd)
450 {
451 	struct dentry *dentry = nd->path.dentry;
452 	int status;
453 
454 	if (nd->flags & LOOKUP_RCU) {
455 		nd->flags &= ~LOOKUP_RCU;
456 		if (!(nd->flags & LOOKUP_ROOT))
457 			nd->root.mnt = NULL;
458 		spin_lock(&dentry->d_lock);
459 		if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
460 			spin_unlock(&dentry->d_lock);
461 			rcu_read_unlock();
462 			br_read_unlock(vfsmount_lock);
463 			return -ECHILD;
464 		}
465 		BUG_ON(nd->inode != dentry->d_inode);
466 		spin_unlock(&dentry->d_lock);
467 		mntget(nd->path.mnt);
468 		rcu_read_unlock();
469 		br_read_unlock(vfsmount_lock);
470 	}
471 
472 	if (likely(!(nd->flags & LOOKUP_JUMPED)))
473 		return 0;
474 
475 	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
476 		return 0;
477 
478 	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
479 		return 0;
480 
481 	/* Note: we do not d_invalidate() */
482 	status = d_revalidate(dentry, nd);
483 	if (status > 0)
484 		return 0;
485 
486 	if (!status)
487 		status = -ESTALE;
488 
489 	path_put(&nd->path);
490 	return status;
491 }
492 
493 static __always_inline void set_root(struct nameidata *nd)
494 {
495 	if (!nd->root.mnt)
496 		get_fs_root(current->fs, &nd->root);
497 }
498 
499 static int link_path_walk(const char *, struct nameidata *);
500 
501 static __always_inline void set_root_rcu(struct nameidata *nd)
502 {
503 	if (!nd->root.mnt) {
504 		struct fs_struct *fs = current->fs;
505 		unsigned seq;
506 
507 		do {
508 			seq = read_seqcount_begin(&fs->seq);
509 			nd->root = fs->root;
510 			nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
511 		} while (read_seqcount_retry(&fs->seq, seq));
512 	}
513 }
514 
515 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
516 {
517 	int ret;
518 
519 	if (IS_ERR(link))
520 		goto fail;
521 
522 	if (*link == '/') {
523 		set_root(nd);
524 		path_put(&nd->path);
525 		nd->path = nd->root;
526 		path_get(&nd->root);
527 		nd->flags |= LOOKUP_JUMPED;
528 	}
529 	nd->inode = nd->path.dentry->d_inode;
530 
531 	ret = link_path_walk(link, nd);
532 	return ret;
533 fail:
534 	path_put(&nd->path);
535 	return PTR_ERR(link);
536 }
537 
538 static void path_put_conditional(struct path *path, struct nameidata *nd)
539 {
540 	dput(path->dentry);
541 	if (path->mnt != nd->path.mnt)
542 		mntput(path->mnt);
543 }
544 
545 static inline void path_to_nameidata(const struct path *path,
546 					struct nameidata *nd)
547 {
548 	if (!(nd->flags & LOOKUP_RCU)) {
549 		dput(nd->path.dentry);
550 		if (nd->path.mnt != path->mnt)
551 			mntput(nd->path.mnt);
552 	}
553 	nd->path.mnt = path->mnt;
554 	nd->path.dentry = path->dentry;
555 }
556 
557 static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
558 {
559 	struct inode *inode = link->dentry->d_inode;
560 	if (!IS_ERR(cookie) && inode->i_op->put_link)
561 		inode->i_op->put_link(link->dentry, nd, cookie);
562 	path_put(link);
563 }
564 
565 static __always_inline int
566 follow_link(struct path *link, struct nameidata *nd, void **p)
567 {
568 	int error;
569 	struct dentry *dentry = link->dentry;
570 
571 	BUG_ON(nd->flags & LOOKUP_RCU);
572 
573 	if (link->mnt == nd->path.mnt)
574 		mntget(link->mnt);
575 
576 	if (unlikely(current->total_link_count >= 40)) {
577 		*p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
578 		path_put(&nd->path);
579 		return -ELOOP;
580 	}
581 	cond_resched();
582 	current->total_link_count++;
583 
584 	touch_atime(link->mnt, dentry);
585 	nd_set_link(nd, NULL);
586 
587 	error = security_inode_follow_link(link->dentry, nd);
588 	if (error) {
589 		*p = ERR_PTR(error); /* no ->put_link(), please */
590 		path_put(&nd->path);
591 		return error;
592 	}
593 
594 	nd->last_type = LAST_BIND;
595 	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
596 	error = PTR_ERR(*p);
597 	if (!IS_ERR(*p)) {
598 		char *s = nd_get_link(nd);
599 		error = 0;
600 		if (s)
601 			error = __vfs_follow_link(nd, s);
602 		else if (nd->last_type == LAST_BIND) {
603 			nd->flags |= LOOKUP_JUMPED;
604 			nd->inode = nd->path.dentry->d_inode;
605 			if (nd->inode->i_op->follow_link) {
606 				/* stepped on a _really_ weird one */
607 				path_put(&nd->path);
608 				error = -ELOOP;
609 			}
610 		}
611 	}
612 	return error;
613 }
614 
615 static int follow_up_rcu(struct path *path)
616 {
617 	struct vfsmount *parent;
618 	struct dentry *mountpoint;
619 
620 	parent = path->mnt->mnt_parent;
621 	if (parent == path->mnt)
622 		return 0;
623 	mountpoint = path->mnt->mnt_mountpoint;
624 	path->dentry = mountpoint;
625 	path->mnt = parent;
626 	return 1;
627 }
628 
629 int follow_up(struct path *path)
630 {
631 	struct vfsmount *parent;
632 	struct dentry *mountpoint;
633 
634 	br_read_lock(vfsmount_lock);
635 	parent = path->mnt->mnt_parent;
636 	if (parent == path->mnt) {
637 		br_read_unlock(vfsmount_lock);
638 		return 0;
639 	}
640 	mntget(parent);
641 	mountpoint = dget(path->mnt->mnt_mountpoint);
642 	br_read_unlock(vfsmount_lock);
643 	dput(path->dentry);
644 	path->dentry = mountpoint;
645 	mntput(path->mnt);
646 	path->mnt = parent;
647 	return 1;
648 }
649 
650 /*
651  * Perform an automount
652  * - return -EISDIR to tell follow_managed() to stop and return the path we
653  *   were called with.
654  */
655 static int follow_automount(struct path *path, unsigned flags,
656 			    bool *need_mntput)
657 {
658 	struct vfsmount *mnt;
659 	int err;
660 
661 	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
662 		return -EREMOTE;
663 
664 	/* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
665 	 * and this is the terminal part of the path.
666 	 */
667 	if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
668 		return -EISDIR; /* we actually want to stop here */
669 
670 	/* We want to mount if someone is trying to open/create a file of any
671 	 * type under the mountpoint, wants to traverse through the mountpoint
672 	 * or wants to open the mounted directory.
673 	 *
674 	 * We don't want to mount if someone's just doing a stat and they've
675 	 * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
676 	 * appended a '/' to the name.
677 	 */
678 	if (!(flags & LOOKUP_FOLLOW) &&
679 	    !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
680 		       LOOKUP_OPEN | LOOKUP_CREATE)))
681 		return -EISDIR;
682 
683 	current->total_link_count++;
684 	if (current->total_link_count >= 40)
685 		return -ELOOP;
686 
687 	mnt = path->dentry->d_op->d_automount(path);
688 	if (IS_ERR(mnt)) {
689 		/*
690 		 * The filesystem is allowed to return -EISDIR here to indicate
691 		 * it doesn't want to automount.  For instance, autofs would do
692 		 * this so that its userspace daemon can mount on this dentry.
693 		 *
694 		 * However, we can only permit this if it's a terminal point in
695 		 * the path being looked up; if it wasn't then the remainder of
696 		 * the path is inaccessible and we should say so.
697 		 */
698 		if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE))
699 			return -EREMOTE;
700 		return PTR_ERR(mnt);
701 	}
702 
703 	if (!mnt) /* mount collision */
704 		return 0;
705 
706 	if (!*need_mntput) {
707 		/* lock_mount() may release path->mnt on error */
708 		mntget(path->mnt);
709 		*need_mntput = true;
710 	}
711 	err = finish_automount(mnt, path);
712 
713 	switch (err) {
714 	case -EBUSY:
715 		/* Someone else made a mount here whilst we were busy */
716 		return 0;
717 	case 0:
718 		path_put(path);
719 		path->mnt = mnt;
720 		path->dentry = dget(mnt->mnt_root);
721 		return 0;
722 	default:
723 		return err;
724 	}
725 
726 }
727 
728 /*
729  * Handle a dentry that is managed in some way.
730  * - Flagged for transit management (autofs)
731  * - Flagged as mountpoint
732  * - Flagged as automount point
733  *
734  * This may only be called in refwalk mode.
735  *
736  * Serialization is taken care of in namespace.c
737  */
738 static int follow_managed(struct path *path, unsigned flags)
739 {
740 	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
741 	unsigned managed;
742 	bool need_mntput = false;
743 	int ret = 0;
744 
745 	/* Given that we're not holding a lock here, we retain the value in a
746 	 * local variable for each dentry as we look at it so that we don't see
747 	 * the components of that value change under us */
748 	while (managed = ACCESS_ONCE(path->dentry->d_flags),
749 	       managed &= DCACHE_MANAGED_DENTRY,
750 	       unlikely(managed != 0)) {
751 		/* Allow the filesystem to manage the transit without i_mutex
752 		 * being held. */
753 		if (managed & DCACHE_MANAGE_TRANSIT) {
754 			BUG_ON(!path->dentry->d_op);
755 			BUG_ON(!path->dentry->d_op->d_manage);
756 			ret = path->dentry->d_op->d_manage(path->dentry, false);
757 			if (ret < 0)
758 				break;
759 		}
760 
761 		/* Transit to a mounted filesystem. */
762 		if (managed & DCACHE_MOUNTED) {
763 			struct vfsmount *mounted = lookup_mnt(path);
764 			if (mounted) {
765 				dput(path->dentry);
766 				if (need_mntput)
767 					mntput(path->mnt);
768 				path->mnt = mounted;
769 				path->dentry = dget(mounted->mnt_root);
770 				need_mntput = true;
771 				continue;
772 			}
773 
774 			/* Something is mounted on this dentry in another
775 			 * namespace and/or whatever was mounted there in this
776 			 * namespace got unmounted before we managed to get the
777 			 * vfsmount_lock */
778 		}
779 
780 		/* Handle an automount point */
781 		if (managed & DCACHE_NEED_AUTOMOUNT) {
782 			ret = follow_automount(path, flags, &need_mntput);
783 			if (ret < 0)
784 				break;
785 			continue;
786 		}
787 
788 		/* We didn't change the current path point */
789 		break;
790 	}
791 
792 	if (need_mntput && path->mnt == mnt)
793 		mntput(path->mnt);
794 	if (ret == -EISDIR)
795 		ret = 0;
796 	return ret;
797 }
798 
799 int follow_down_one(struct path *path)
800 {
801 	struct vfsmount *mounted;
802 
803 	mounted = lookup_mnt(path);
804 	if (mounted) {
805 		dput(path->dentry);
806 		mntput(path->mnt);
807 		path->mnt = mounted;
808 		path->dentry = dget(mounted->mnt_root);
809 		return 1;
810 	}
811 	return 0;
812 }
813 
814 static inline bool managed_dentry_might_block(struct dentry *dentry)
815 {
816 	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
817 		dentry->d_op->d_manage(dentry, true) < 0);
818 }
819 
820 /*
821  * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
822  * we meet a managed dentry that would need blocking.
823  */
824 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
825 			       struct inode **inode)
826 {
827 	for (;;) {
828 		struct vfsmount *mounted;
829 		/*
830 		 * Don't forget we might have a non-mountpoint managed dentry
831 		 * that wants to block transit.
832 		 */
833 		if (unlikely(managed_dentry_might_block(path->dentry)))
834 			return false;
835 
836 		if (!d_mountpoint(path->dentry))
837 			break;
838 
839 		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
840 		if (!mounted)
841 			break;
842 		path->mnt = mounted;
843 		path->dentry = mounted->mnt_root;
844 		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
845 		/*
846 		 * Update the inode too. We don't need to re-check the
847 		 * dentry sequence number here after this d_inode read,
848 		 * because a mount-point is always pinned.
849 		 */
850 		*inode = path->dentry->d_inode;
851 	}
852 	return true;
853 }
854 
855 static void follow_mount_rcu(struct nameidata *nd)
856 {
857 	while (d_mountpoint(nd->path.dentry)) {
858 		struct vfsmount *mounted;
859 		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
860 		if (!mounted)
861 			break;
862 		nd->path.mnt = mounted;
863 		nd->path.dentry = mounted->mnt_root;
864 		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
865 	}
866 }
867 
868 static int follow_dotdot_rcu(struct nameidata *nd)
869 {
870 	set_root_rcu(nd);
871 
872 	while (1) {
873 		if (nd->path.dentry == nd->root.dentry &&
874 		    nd->path.mnt == nd->root.mnt) {
875 			break;
876 		}
877 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
878 			struct dentry *old = nd->path.dentry;
879 			struct dentry *parent = old->d_parent;
880 			unsigned seq;
881 
882 			seq = read_seqcount_begin(&parent->d_seq);
883 			if (read_seqcount_retry(&old->d_seq, nd->seq))
884 				goto failed;
885 			nd->path.dentry = parent;
886 			nd->seq = seq;
887 			break;
888 		}
889 		if (!follow_up_rcu(&nd->path))
890 			break;
891 		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
892 	}
893 	follow_mount_rcu(nd);
894 	nd->inode = nd->path.dentry->d_inode;
895 	return 0;
896 
897 failed:
898 	nd->flags &= ~LOOKUP_RCU;
899 	if (!(nd->flags & LOOKUP_ROOT))
900 		nd->root.mnt = NULL;
901 	rcu_read_unlock();
902 	br_read_unlock(vfsmount_lock);
903 	return -ECHILD;
904 }
905 
906 /*
907  * Follow down to the covering mount currently visible to userspace.  At each
908  * point, the filesystem owning that dentry may be queried as to whether the
909  * caller is permitted to proceed or not.
910  */
911 int follow_down(struct path *path)
912 {
913 	unsigned managed;
914 	int ret;
915 
916 	while (managed = ACCESS_ONCE(path->dentry->d_flags),
917 	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
918 		/* Allow the filesystem to manage the transit without i_mutex
919 		 * being held.
920 		 *
921 		 * We indicate to the filesystem if someone is trying to mount
922 		 * something here.  This gives autofs the chance to deny anyone
923 		 * other than its daemon the right to mount on its
924 		 * superstructure.
925 		 *
926 		 * The filesystem may sleep at this point.
927 		 */
928 		if (managed & DCACHE_MANAGE_TRANSIT) {
929 			BUG_ON(!path->dentry->d_op);
930 			BUG_ON(!path->dentry->d_op->d_manage);
931 			ret = path->dentry->d_op->d_manage(
932 				path->dentry, false);
933 			if (ret < 0)
934 				return ret == -EISDIR ? 0 : ret;
935 		}
936 
937 		/* Transit to a mounted filesystem. */
938 		if (managed & DCACHE_MOUNTED) {
939 			struct vfsmount *mounted = lookup_mnt(path);
940 			if (!mounted)
941 				break;
942 			dput(path->dentry);
943 			mntput(path->mnt);
944 			path->mnt = mounted;
945 			path->dentry = dget(mounted->mnt_root);
946 			continue;
947 		}
948 
949 		/* Don't handle automount points here */
950 		break;
951 	}
952 	return 0;
953 }
954 
955 /*
956  * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
957  */
958 static void follow_mount(struct path *path)
959 {
960 	while (d_mountpoint(path->dentry)) {
961 		struct vfsmount *mounted = lookup_mnt(path);
962 		if (!mounted)
963 			break;
964 		dput(path->dentry);
965 		mntput(path->mnt);
966 		path->mnt = mounted;
967 		path->dentry = dget(mounted->mnt_root);
968 	}
969 }
970 
971 static void follow_dotdot(struct nameidata *nd)
972 {
973 	set_root(nd);
974 
975 	while(1) {
976 		struct dentry *old = nd->path.dentry;
977 
978 		if (nd->path.dentry == nd->root.dentry &&
979 		    nd->path.mnt == nd->root.mnt) {
980 			break;
981 		}
982 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
983 			/* rare case of legitimate dget_parent()... */
984 			nd->path.dentry = dget_parent(nd->path.dentry);
985 			dput(old);
986 			break;
987 		}
988 		if (!follow_up(&nd->path))
989 			break;
990 	}
991 	follow_mount(&nd->path);
992 	nd->inode = nd->path.dentry->d_inode;
993 }
994 
995 /*
996  * Allocate a dentry with name and parent, and perform a parent
997  * directory ->lookup on it. Returns the new dentry, or ERR_PTR
998  * on error. parent->d_inode->i_mutex must be held. d_lookup must
999  * have verified that no child exists while under i_mutex.
1000  */
1001 static struct dentry *d_alloc_and_lookup(struct dentry *parent,
1002 				struct qstr *name, struct nameidata *nd)
1003 {
1004 	struct inode *inode = parent->d_inode;
1005 	struct dentry *dentry;
1006 	struct dentry *old;
1007 
1008 	/* Don't create child dentry for a dead directory. */
1009 	if (unlikely(IS_DEADDIR(inode)))
1010 		return ERR_PTR(-ENOENT);
1011 
1012 	dentry = d_alloc(parent, name);
1013 	if (unlikely(!dentry))
1014 		return ERR_PTR(-ENOMEM);
1015 
1016 	old = inode->i_op->lookup(inode, dentry, nd);
1017 	if (unlikely(old)) {
1018 		dput(dentry);
1019 		dentry = old;
1020 	}
1021 	return dentry;
1022 }
1023 
1024 /*
1025  * We already have a dentry, but require a lookup to be performed on the parent
1026  * directory to fill in d_inode. Returns the new dentry, or ERR_PTR on error.
1027  * parent->d_inode->i_mutex must be held. d_lookup must have verified that no
1028  * child exists while under i_mutex.
1029  */
1030 static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentry,
1031 				     struct nameidata *nd)
1032 {
1033 	struct inode *inode = parent->d_inode;
1034 	struct dentry *old;
1035 
1036 	/* Don't create child dentry for a dead directory. */
1037 	if (unlikely(IS_DEADDIR(inode)))
1038 		return ERR_PTR(-ENOENT);
1039 
1040 	old = inode->i_op->lookup(inode, dentry, nd);
1041 	if (unlikely(old)) {
1042 		dput(dentry);
1043 		dentry = old;
1044 	}
1045 	return dentry;
1046 }
1047 
1048 /*
1049  *  It's more convoluted than I'd like it to be, but... it's still fairly
1050  *  small and for now I'd prefer to have fast path as straight as possible.
1051  *  It _is_ time-critical.
1052  */
1053 static int do_lookup(struct nameidata *nd, struct qstr *name,
1054 			struct path *path, struct inode **inode)
1055 {
1056 	struct vfsmount *mnt = nd->path.mnt;
1057 	struct dentry *dentry, *parent = nd->path.dentry;
1058 	int need_reval = 1;
1059 	int status = 1;
1060 	int err;
1061 
1062 	/*
1063 	 * Rename seqlock is not required here because in the off chance
1064 	 * of a false negative due to a concurrent rename, we're going to
1065 	 * do the non-racy lookup, below.
1066 	 */
1067 	if (nd->flags & LOOKUP_RCU) {
1068 		unsigned seq;
1069 		*inode = nd->inode;
1070 		dentry = __d_lookup_rcu(parent, name, &seq, inode);
1071 		if (!dentry)
1072 			goto unlazy;
1073 
1074 		/* Memory barrier in read_seqcount_begin of child is enough */
1075 		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1076 			return -ECHILD;
1077 		nd->seq = seq;
1078 
1079 		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1080 			status = d_revalidate(dentry, nd);
1081 			if (unlikely(status <= 0)) {
1082 				if (status != -ECHILD)
1083 					need_reval = 0;
1084 				goto unlazy;
1085 			}
1086 		}
1087 		if (unlikely(d_need_lookup(dentry)))
1088 			goto unlazy;
1089 		path->mnt = mnt;
1090 		path->dentry = dentry;
1091 		if (unlikely(!__follow_mount_rcu(nd, path, inode)))
1092 			goto unlazy;
1093 		if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1094 			goto unlazy;
1095 		return 0;
1096 unlazy:
1097 		if (unlazy_walk(nd, dentry))
1098 			return -ECHILD;
1099 	} else {
1100 		dentry = __d_lookup(parent, name);
1101 	}
1102 
1103 	if (dentry && unlikely(d_need_lookup(dentry))) {
1104 		dput(dentry);
1105 		dentry = NULL;
1106 	}
1107 retry:
1108 	if (unlikely(!dentry)) {
1109 		struct inode *dir = parent->d_inode;
1110 		BUG_ON(nd->inode != dir);
1111 
1112 		mutex_lock(&dir->i_mutex);
1113 		dentry = d_lookup(parent, name);
1114 		if (likely(!dentry)) {
1115 			dentry = d_alloc_and_lookup(parent, name, nd);
1116 			if (IS_ERR(dentry)) {
1117 				mutex_unlock(&dir->i_mutex);
1118 				return PTR_ERR(dentry);
1119 			}
1120 			/* known good */
1121 			need_reval = 0;
1122 			status = 1;
1123 		} else if (unlikely(d_need_lookup(dentry))) {
1124 			dentry = d_inode_lookup(parent, dentry, nd);
1125 			if (IS_ERR(dentry)) {
1126 				mutex_unlock(&dir->i_mutex);
1127 				return PTR_ERR(dentry);
1128 			}
1129 			/* known good */
1130 			need_reval = 0;
1131 			status = 1;
1132 		}
1133 		mutex_unlock(&dir->i_mutex);
1134 	}
1135 	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1136 		status = d_revalidate(dentry, nd);
1137 	if (unlikely(status <= 0)) {
1138 		if (status < 0) {
1139 			dput(dentry);
1140 			return status;
1141 		}
1142 		if (!d_invalidate(dentry)) {
1143 			dput(dentry);
1144 			dentry = NULL;
1145 			need_reval = 1;
1146 			goto retry;
1147 		}
1148 	}
1149 
1150 	path->mnt = mnt;
1151 	path->dentry = dentry;
1152 	err = follow_managed(path, nd->flags);
1153 	if (unlikely(err < 0)) {
1154 		path_put_conditional(path, nd);
1155 		return err;
1156 	}
1157 	*inode = path->dentry->d_inode;
1158 	return 0;
1159 }
1160 
1161 static inline int may_lookup(struct nameidata *nd)
1162 {
1163 	if (nd->flags & LOOKUP_RCU) {
1164 		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1165 		if (err != -ECHILD)
1166 			return err;
1167 		if (unlazy_walk(nd, NULL))
1168 			return -ECHILD;
1169 	}
1170 	return inode_permission(nd->inode, MAY_EXEC);
1171 }
1172 
1173 static inline int handle_dots(struct nameidata *nd, int type)
1174 {
1175 	if (type == LAST_DOTDOT) {
1176 		if (nd->flags & LOOKUP_RCU) {
1177 			if (follow_dotdot_rcu(nd))
1178 				return -ECHILD;
1179 		} else
1180 			follow_dotdot(nd);
1181 	}
1182 	return 0;
1183 }
1184 
1185 static void terminate_walk(struct nameidata *nd)
1186 {
1187 	if (!(nd->flags & LOOKUP_RCU)) {
1188 		path_put(&nd->path);
1189 	} else {
1190 		nd->flags &= ~LOOKUP_RCU;
1191 		if (!(nd->flags & LOOKUP_ROOT))
1192 			nd->root.mnt = NULL;
1193 		rcu_read_unlock();
1194 		br_read_unlock(vfsmount_lock);
1195 	}
1196 }
1197 
1198 static inline int walk_component(struct nameidata *nd, struct path *path,
1199 		struct qstr *name, int type, int follow)
1200 {
1201 	struct inode *inode;
1202 	int err;
1203 	/*
1204 	 * "." and ".." are special - ".." especially so because it has
1205 	 * to be able to know about the current root directory and
1206 	 * parent relationships.
1207 	 */
1208 	if (unlikely(type != LAST_NORM))
1209 		return handle_dots(nd, type);
1210 	err = do_lookup(nd, name, path, &inode);
1211 	if (unlikely(err)) {
1212 		terminate_walk(nd);
1213 		return err;
1214 	}
1215 	if (!inode) {
1216 		path_to_nameidata(path, nd);
1217 		terminate_walk(nd);
1218 		return -ENOENT;
1219 	}
1220 	if (unlikely(inode->i_op->follow_link) && follow) {
1221 		if (nd->flags & LOOKUP_RCU) {
1222 			if (unlikely(unlazy_walk(nd, path->dentry))) {
1223 				terminate_walk(nd);
1224 				return -ECHILD;
1225 			}
1226 		}
1227 		BUG_ON(inode != path->dentry->d_inode);
1228 		return 1;
1229 	}
1230 	path_to_nameidata(path, nd);
1231 	nd->inode = inode;
1232 	return 0;
1233 }
1234 
1235 /*
1236  * This limits recursive symlink follows to 8, while
1237  * limiting consecutive symlinks to 40.
1238  *
1239  * Without that kind of total limit, nasty chains of consecutive
1240  * symlinks can cause almost arbitrarily long lookups.
1241  */
1242 static inline int nested_symlink(struct path *path, struct nameidata *nd)
1243 {
1244 	int res;
1245 
1246 	if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
1247 		path_put_conditional(path, nd);
1248 		path_put(&nd->path);
1249 		return -ELOOP;
1250 	}
1251 	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
1252 
1253 	nd->depth++;
1254 	current->link_count++;
1255 
1256 	do {
1257 		struct path link = *path;
1258 		void *cookie;
1259 
1260 		res = follow_link(&link, nd, &cookie);
1261 		if (!res)
1262 			res = walk_component(nd, path, &nd->last,
1263 					     nd->last_type, LOOKUP_FOLLOW);
1264 		put_link(nd, &link, cookie);
1265 	} while (res > 0);
1266 
1267 	current->link_count--;
1268 	nd->depth--;
1269 	return res;
1270 }
1271 
1272 /*
1273  * Name resolution.
1274  * This is the basic name resolution function, turning a pathname into
1275  * the final dentry. We expect 'base' to be positive and a directory.
1276  *
1277  * Returns 0 and nd will have valid dentry and mnt on success.
1278  * Returns error and drops reference to input namei data on failure.
1279  */
1280 static int link_path_walk(const char *name, struct nameidata *nd)
1281 {
1282 	struct path next;
1283 	int err;
1284 	unsigned int lookup_flags = nd->flags;
1285 
1286 	while (*name=='/')
1287 		name++;
1288 	if (!*name)
1289 		return 0;
1290 
1291 	/* At this point we know we have a real path component. */
1292 	for(;;) {
1293 		unsigned long hash;
1294 		struct qstr this;
1295 		unsigned int c;
1296 		int type;
1297 
1298 		nd->flags |= LOOKUP_CONTINUE;
1299 
1300 		err = may_lookup(nd);
1301  		if (err)
1302 			break;
1303 
1304 		this.name = name;
1305 		c = *(const unsigned char *)name;
1306 
1307 		hash = init_name_hash();
1308 		do {
1309 			name++;
1310 			hash = partial_name_hash(c, hash);
1311 			c = *(const unsigned char *)name;
1312 		} while (c && (c != '/'));
1313 		this.len = name - (const char *) this.name;
1314 		this.hash = end_name_hash(hash);
1315 
1316 		type = LAST_NORM;
1317 		if (this.name[0] == '.') switch (this.len) {
1318 			case 2:
1319 				if (this.name[1] == '.') {
1320 					type = LAST_DOTDOT;
1321 					nd->flags |= LOOKUP_JUMPED;
1322 				}
1323 				break;
1324 			case 1:
1325 				type = LAST_DOT;
1326 		}
1327 		if (likely(type == LAST_NORM)) {
1328 			struct dentry *parent = nd->path.dentry;
1329 			nd->flags &= ~LOOKUP_JUMPED;
1330 			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1331 				err = parent->d_op->d_hash(parent, nd->inode,
1332 							   &this);
1333 				if (err < 0)
1334 					break;
1335 			}
1336 		}
1337 
1338 		/* remove trailing slashes? */
1339 		if (!c)
1340 			goto last_component;
1341 		while (*++name == '/');
1342 		if (!*name)
1343 			goto last_component;
1344 
1345 		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
1346 		if (err < 0)
1347 			return err;
1348 
1349 		if (err) {
1350 			err = nested_symlink(&next, nd);
1351 			if (err)
1352 				return err;
1353 		}
1354 		err = -ENOTDIR;
1355 		if (!nd->inode->i_op->lookup)
1356 			break;
1357 		continue;
1358 		/* here ends the main loop */
1359 
1360 last_component:
1361 		/* Clear LOOKUP_CONTINUE iff it was previously unset */
1362 		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
1363 		nd->last = this;
1364 		nd->last_type = type;
1365 		return 0;
1366 	}
1367 	terminate_walk(nd);
1368 	return err;
1369 }
1370 
1371 static int path_init(int dfd, const char *name, unsigned int flags,
1372 		     struct nameidata *nd, struct file **fp)
1373 {
1374 	int retval = 0;
1375 	int fput_needed;
1376 	struct file *file;
1377 
1378 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
1379 	nd->flags = flags | LOOKUP_JUMPED;
1380 	nd->depth = 0;
1381 	if (flags & LOOKUP_ROOT) {
1382 		struct inode *inode = nd->root.dentry->d_inode;
1383 		if (*name) {
1384 			if (!inode->i_op->lookup)
1385 				return -ENOTDIR;
1386 			retval = inode_permission(inode, MAY_EXEC);
1387 			if (retval)
1388 				return retval;
1389 		}
1390 		nd->path = nd->root;
1391 		nd->inode = inode;
1392 		if (flags & LOOKUP_RCU) {
1393 			br_read_lock(vfsmount_lock);
1394 			rcu_read_lock();
1395 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1396 		} else {
1397 			path_get(&nd->path);
1398 		}
1399 		return 0;
1400 	}
1401 
1402 	nd->root.mnt = NULL;
1403 
1404 	if (*name=='/') {
1405 		if (flags & LOOKUP_RCU) {
1406 			br_read_lock(vfsmount_lock);
1407 			rcu_read_lock();
1408 			set_root_rcu(nd);
1409 		} else {
1410 			set_root(nd);
1411 			path_get(&nd->root);
1412 		}
1413 		nd->path = nd->root;
1414 	} else if (dfd == AT_FDCWD) {
1415 		if (flags & LOOKUP_RCU) {
1416 			struct fs_struct *fs = current->fs;
1417 			unsigned seq;
1418 
1419 			br_read_lock(vfsmount_lock);
1420 			rcu_read_lock();
1421 
1422 			do {
1423 				seq = read_seqcount_begin(&fs->seq);
1424 				nd->path = fs->pwd;
1425 				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1426 			} while (read_seqcount_retry(&fs->seq, seq));
1427 		} else {
1428 			get_fs_pwd(current->fs, &nd->path);
1429 		}
1430 	} else {
1431 		struct dentry *dentry;
1432 
1433 		file = fget_raw_light(dfd, &fput_needed);
1434 		retval = -EBADF;
1435 		if (!file)
1436 			goto out_fail;
1437 
1438 		dentry = file->f_path.dentry;
1439 
1440 		if (*name) {
1441 			retval = -ENOTDIR;
1442 			if (!S_ISDIR(dentry->d_inode->i_mode))
1443 				goto fput_fail;
1444 
1445 			retval = inode_permission(dentry->d_inode, MAY_EXEC);
1446 			if (retval)
1447 				goto fput_fail;
1448 		}
1449 
1450 		nd->path = file->f_path;
1451 		if (flags & LOOKUP_RCU) {
1452 			if (fput_needed)
1453 				*fp = file;
1454 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1455 			br_read_lock(vfsmount_lock);
1456 			rcu_read_lock();
1457 		} else {
1458 			path_get(&file->f_path);
1459 			fput_light(file, fput_needed);
1460 		}
1461 	}
1462 
1463 	nd->inode = nd->path.dentry->d_inode;
1464 	return 0;
1465 
1466 fput_fail:
1467 	fput_light(file, fput_needed);
1468 out_fail:
1469 	return retval;
1470 }
1471 
1472 static inline int lookup_last(struct nameidata *nd, struct path *path)
1473 {
1474 	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
1475 		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1476 
1477 	nd->flags &= ~LOOKUP_PARENT;
1478 	return walk_component(nd, path, &nd->last, nd->last_type,
1479 					nd->flags & LOOKUP_FOLLOW);
1480 }
1481 
1482 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
1483 static int path_lookupat(int dfd, const char *name,
1484 				unsigned int flags, struct nameidata *nd)
1485 {
1486 	struct file *base = NULL;
1487 	struct path path;
1488 	int err;
1489 
1490 	/*
1491 	 * Path walking is largely split up into 2 different synchronisation
1492 	 * schemes, rcu-walk and ref-walk (explained in
1493 	 * Documentation/filesystems/path-lookup.txt). These share much of the
1494 	 * path walk code, but some things particularly setup, cleanup, and
1495 	 * following mounts are sufficiently divergent that functions are
1496 	 * duplicated. Typically there is a function foo(), and its RCU
1497 	 * analogue, foo_rcu().
1498 	 *
1499 	 * -ECHILD is the error number of choice (just to avoid clashes) that
1500 	 * is returned if some aspect of an rcu-walk fails. Such an error must
1501 	 * be handled by restarting a traditional ref-walk (which will always
1502 	 * be able to complete).
1503 	 */
1504 	err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
1505 
1506 	if (unlikely(err))
1507 		return err;
1508 
1509 	current->total_link_count = 0;
1510 	err = link_path_walk(name, nd);
1511 
1512 	if (!err && !(flags & LOOKUP_PARENT)) {
1513 		err = lookup_last(nd, &path);
1514 		while (err > 0) {
1515 			void *cookie;
1516 			struct path link = path;
1517 			nd->flags |= LOOKUP_PARENT;
1518 			err = follow_link(&link, nd, &cookie);
1519 			if (!err)
1520 				err = lookup_last(nd, &path);
1521 			put_link(nd, &link, cookie);
1522 		}
1523 	}
1524 
1525 	if (!err)
1526 		err = complete_walk(nd);
1527 
1528 	if (!err && nd->flags & LOOKUP_DIRECTORY) {
1529 		if (!nd->inode->i_op->lookup) {
1530 			path_put(&nd->path);
1531 			err = -ENOTDIR;
1532 		}
1533 	}
1534 
1535 	if (base)
1536 		fput(base);
1537 
1538 	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
1539 		path_put(&nd->root);
1540 		nd->root.mnt = NULL;
1541 	}
1542 	return err;
1543 }
1544 
1545 static int do_path_lookup(int dfd, const char *name,
1546 				unsigned int flags, struct nameidata *nd)
1547 {
1548 	int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
1549 	if (unlikely(retval == -ECHILD))
1550 		retval = path_lookupat(dfd, name, flags, nd);
1551 	if (unlikely(retval == -ESTALE))
1552 		retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
1553 
1554 	if (likely(!retval)) {
1555 		if (unlikely(!audit_dummy_context())) {
1556 			if (nd->path.dentry && nd->inode)
1557 				audit_inode(name, nd->path.dentry);
1558 		}
1559 	}
1560 	return retval;
1561 }
1562 
1563 int kern_path_parent(const char *name, struct nameidata *nd)
1564 {
1565 	return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
1566 }
1567 
1568 int kern_path(const char *name, unsigned int flags, struct path *path)
1569 {
1570 	struct nameidata nd;
1571 	int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
1572 	if (!res)
1573 		*path = nd.path;
1574 	return res;
1575 }
1576 
1577 /**
1578  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
1579  * @dentry:  pointer to dentry of the base directory
1580  * @mnt: pointer to vfs mount of the base directory
1581  * @name: pointer to file name
1582  * @flags: lookup flags
1583  * @nd: pointer to nameidata
1584  */
1585 int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1586 		    const char *name, unsigned int flags,
1587 		    struct nameidata *nd)
1588 {
1589 	nd->root.dentry = dentry;
1590 	nd->root.mnt = mnt;
1591 	/* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
1592 	return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
1593 }
1594 
1595 static struct dentry *__lookup_hash(struct qstr *name,
1596 		struct dentry *base, struct nameidata *nd)
1597 {
1598 	struct inode *inode = base->d_inode;
1599 	struct dentry *dentry;
1600 	int err;
1601 
1602 	err = inode_permission(inode, MAY_EXEC);
1603 	if (err)
1604 		return ERR_PTR(err);
1605 
1606 	/*
1607 	 * Don't bother with __d_lookup: callers are for creat as
1608 	 * well as unlink, so a lot of the time it would cost
1609 	 * a double lookup.
1610 	 */
1611 	dentry = d_lookup(base, name);
1612 
1613 	if (dentry && d_need_lookup(dentry)) {
1614 		/*
1615 		 * __lookup_hash is called with the parent dir's i_mutex already
1616 		 * held, so we are good to go here.
1617 		 */
1618 		dentry = d_inode_lookup(base, dentry, nd);
1619 		if (IS_ERR(dentry))
1620 			return dentry;
1621 	}
1622 
1623 	if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1624 		int status = d_revalidate(dentry, nd);
1625 		if (unlikely(status <= 0)) {
1626 			/*
1627 			 * The dentry failed validation.
1628 			 * If d_revalidate returned 0 attempt to invalidate
1629 			 * the dentry otherwise d_revalidate is asking us
1630 			 * to return a fail status.
1631 			 */
1632 			if (status < 0) {
1633 				dput(dentry);
1634 				return ERR_PTR(status);
1635 			} else if (!d_invalidate(dentry)) {
1636 				dput(dentry);
1637 				dentry = NULL;
1638 			}
1639 		}
1640 	}
1641 
1642 	if (!dentry)
1643 		dentry = d_alloc_and_lookup(base, name, nd);
1644 
1645 	return dentry;
1646 }
1647 
1648 /*
1649  * Restricted form of lookup. Doesn't follow links, single-component only,
1650  * needs parent already locked. Doesn't follow mounts.
1651  * SMP-safe.
1652  */
1653 static struct dentry *lookup_hash(struct nameidata *nd)
1654 {
1655 	return __lookup_hash(&nd->last, nd->path.dentry, nd);
1656 }
1657 
1658 /**
1659  * lookup_one_len - filesystem helper to lookup single pathname component
1660  * @name:	pathname component to lookup
1661  * @base:	base directory to lookup from
1662  * @len:	maximum length @len should be interpreted to
1663  *
1664  * Note that this routine is purely a helper for filesystem usage and should
1665  * not be called by generic code.  Also note that by using this function the
1666  * nameidata argument is passed to the filesystem methods and a filesystem
1667  * using this helper needs to be prepared for that.
1668  */
1669 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1670 {
1671 	struct qstr this;
1672 	unsigned long hash;
1673 	unsigned int c;
1674 
1675 	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
1676 
1677 	this.name = name;
1678 	this.len = len;
1679 	if (!len)
1680 		return ERR_PTR(-EACCES);
1681 
1682 	hash = init_name_hash();
1683 	while (len--) {
1684 		c = *(const unsigned char *)name++;
1685 		if (c == '/' || c == '\0')
1686 			return ERR_PTR(-EACCES);
1687 		hash = partial_name_hash(c, hash);
1688 	}
1689 	this.hash = end_name_hash(hash);
1690 	/*
1691 	 * See if the low-level filesystem might want
1692 	 * to use its own hash..
1693 	 */
1694 	if (base->d_flags & DCACHE_OP_HASH) {
1695 		int err = base->d_op->d_hash(base, base->d_inode, &this);
1696 		if (err < 0)
1697 			return ERR_PTR(err);
1698 	}
1699 
1700 	return __lookup_hash(&this, base, NULL);
1701 }
1702 
1703 int user_path_at(int dfd, const char __user *name, unsigned flags,
1704 		 struct path *path)
1705 {
1706 	struct nameidata nd;
1707 	char *tmp = getname_flags(name, flags);
1708 	int err = PTR_ERR(tmp);
1709 	if (!IS_ERR(tmp)) {
1710 
1711 		BUG_ON(flags & LOOKUP_PARENT);
1712 
1713 		err = do_path_lookup(dfd, tmp, flags, &nd);
1714 		putname(tmp);
1715 		if (!err)
1716 			*path = nd.path;
1717 	}
1718 	return err;
1719 }
1720 
1721 static int user_path_parent(int dfd, const char __user *path,
1722 			struct nameidata *nd, char **name)
1723 {
1724 	char *s = getname(path);
1725 	int error;
1726 
1727 	if (IS_ERR(s))
1728 		return PTR_ERR(s);
1729 
1730 	error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd);
1731 	if (error)
1732 		putname(s);
1733 	else
1734 		*name = s;
1735 
1736 	return error;
1737 }
1738 
1739 /*
1740  * It's inline, so penalty for filesystems that don't use sticky bit is
1741  * minimal.
1742  */
1743 static inline int check_sticky(struct inode *dir, struct inode *inode)
1744 {
1745 	uid_t fsuid = current_fsuid();
1746 
1747 	if (!(dir->i_mode & S_ISVTX))
1748 		return 0;
1749 	if (current_user_ns() != inode_userns(inode))
1750 		goto other_userns;
1751 	if (inode->i_uid == fsuid)
1752 		return 0;
1753 	if (dir->i_uid == fsuid)
1754 		return 0;
1755 
1756 other_userns:
1757 	return !ns_capable(inode_userns(inode), CAP_FOWNER);
1758 }
1759 
1760 /*
1761  *	Check whether we can remove a link victim from directory dir, check
1762  *  whether the type of victim is right.
1763  *  1. We can't do it if dir is read-only (done in permission())
1764  *  2. We should have write and exec permissions on dir
1765  *  3. We can't remove anything from append-only dir
1766  *  4. We can't do anything with immutable dir (done in permission())
1767  *  5. If the sticky bit on dir is set we should either
1768  *	a. be owner of dir, or
1769  *	b. be owner of victim, or
1770  *	c. have CAP_FOWNER capability
1771  *  6. If the victim is append-only or immutable we can't do antyhing with
1772  *     links pointing to it.
1773  *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
1774  *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
1775  *  9. We can't remove a root or mountpoint.
1776  * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
1777  *     nfs_async_unlink().
1778  */
1779 static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
1780 {
1781 	int error;
1782 
1783 	if (!victim->d_inode)
1784 		return -ENOENT;
1785 
1786 	BUG_ON(victim->d_parent->d_inode != dir);
1787 	audit_inode_child(victim, dir);
1788 
1789 	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
1790 	if (error)
1791 		return error;
1792 	if (IS_APPEND(dir))
1793 		return -EPERM;
1794 	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
1795 	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
1796 		return -EPERM;
1797 	if (isdir) {
1798 		if (!S_ISDIR(victim->d_inode->i_mode))
1799 			return -ENOTDIR;
1800 		if (IS_ROOT(victim))
1801 			return -EBUSY;
1802 	} else if (S_ISDIR(victim->d_inode->i_mode))
1803 		return -EISDIR;
1804 	if (IS_DEADDIR(dir))
1805 		return -ENOENT;
1806 	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
1807 		return -EBUSY;
1808 	return 0;
1809 }
1810 
1811 /*	Check whether we can create an object with dentry child in directory
1812  *  dir.
1813  *  1. We can't do it if child already exists (open has special treatment for
1814  *     this case, but since we are inlined it's OK)
1815  *  2. We can't do it if dir is read-only (done in permission())
1816  *  3. We should have write and exec permissions on dir
1817  *  4. We can't do it if dir is immutable (done in permission())
1818  */
1819 static inline int may_create(struct inode *dir, struct dentry *child)
1820 {
1821 	if (child->d_inode)
1822 		return -EEXIST;
1823 	if (IS_DEADDIR(dir))
1824 		return -ENOENT;
1825 	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
1826 }
1827 
1828 /*
1829  * p1 and p2 should be directories on the same fs.
1830  */
1831 struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
1832 {
1833 	struct dentry *p;
1834 
1835 	if (p1 == p2) {
1836 		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1837 		return NULL;
1838 	}
1839 
1840 	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
1841 
1842 	p = d_ancestor(p2, p1);
1843 	if (p) {
1844 		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
1845 		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
1846 		return p;
1847 	}
1848 
1849 	p = d_ancestor(p1, p2);
1850 	if (p) {
1851 		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1852 		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
1853 		return p;
1854 	}
1855 
1856 	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1857 	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
1858 	return NULL;
1859 }
1860 
1861 void unlock_rename(struct dentry *p1, struct dentry *p2)
1862 {
1863 	mutex_unlock(&p1->d_inode->i_mutex);
1864 	if (p1 != p2) {
1865 		mutex_unlock(&p2->d_inode->i_mutex);
1866 		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
1867 	}
1868 }
1869 
1870 int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1871 		struct nameidata *nd)
1872 {
1873 	int error = may_create(dir, dentry);
1874 
1875 	if (error)
1876 		return error;
1877 
1878 	if (!dir->i_op->create)
1879 		return -EACCES;	/* shouldn't it be ENOSYS? */
1880 	mode &= S_IALLUGO;
1881 	mode |= S_IFREG;
1882 	error = security_inode_create(dir, dentry, mode);
1883 	if (error)
1884 		return error;
1885 	error = dir->i_op->create(dir, dentry, mode, nd);
1886 	if (!error)
1887 		fsnotify_create(dir, dentry);
1888 	return error;
1889 }
1890 
1891 static int may_open(struct path *path, int acc_mode, int flag)
1892 {
1893 	struct dentry *dentry = path->dentry;
1894 	struct inode *inode = dentry->d_inode;
1895 	int error;
1896 
1897 	/* O_PATH? */
1898 	if (!acc_mode)
1899 		return 0;
1900 
1901 	if (!inode)
1902 		return -ENOENT;
1903 
1904 	switch (inode->i_mode & S_IFMT) {
1905 	case S_IFLNK:
1906 		return -ELOOP;
1907 	case S_IFDIR:
1908 		if (acc_mode & MAY_WRITE)
1909 			return -EISDIR;
1910 		break;
1911 	case S_IFBLK:
1912 	case S_IFCHR:
1913 		if (path->mnt->mnt_flags & MNT_NODEV)
1914 			return -EACCES;
1915 		/*FALLTHRU*/
1916 	case S_IFIFO:
1917 	case S_IFSOCK:
1918 		flag &= ~O_TRUNC;
1919 		break;
1920 	}
1921 
1922 	error = inode_permission(inode, acc_mode);
1923 	if (error)
1924 		return error;
1925 
1926 	/*
1927 	 * An append-only file must be opened in append mode for writing.
1928 	 */
1929 	if (IS_APPEND(inode)) {
1930 		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
1931 			return -EPERM;
1932 		if (flag & O_TRUNC)
1933 			return -EPERM;
1934 	}
1935 
1936 	/* O_NOATIME can only be set by the owner or superuser */
1937 	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
1938 		return -EPERM;
1939 
1940 	/*
1941 	 * Ensure there are no outstanding leases on the file.
1942 	 */
1943 	return break_lease(inode, flag);
1944 }
1945 
1946 static int handle_truncate(struct file *filp)
1947 {
1948 	struct path *path = &filp->f_path;
1949 	struct inode *inode = path->dentry->d_inode;
1950 	int error = get_write_access(inode);
1951 	if (error)
1952 		return error;
1953 	/*
1954 	 * Refuse to truncate files with mandatory locks held on them.
1955 	 */
1956 	error = locks_verify_locked(inode);
1957 	if (!error)
1958 		error = security_path_truncate(path);
1959 	if (!error) {
1960 		error = do_truncate(path->dentry, 0,
1961 				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1962 				    filp);
1963 	}
1964 	put_write_access(inode);
1965 	return error;
1966 }
1967 
/*
 * Convert sys_open()'s access-mode encoding to the internal one.
 *
 * For sys_open the low two bits of the flag value mean:
 *	00 - read-only
 *	01 - write-only
 *	10 - read-write
 *	11 - special
 * The internal routines (ie open_namei()/follow_link() etc) use:
 *	00 - no permissions needed
 *	01 - read-permission
 *	10 - write-permission
 *	11 - read-write
 * This is more logical, and also allows the 00 "no perm needed"
 * to be used for symlinks (where the permissions are checked
 * later).
 *
 * The conversion adds one to the flag value, except when doing so would
 * wrap the access-mode bits round to zero (the "special" value 11), in
 * which case the value is returned unchanged.  All other bits pass
 * through untouched.
 */
static inline int open_to_namei_flags(int flag)
{
	const int bumped = flag + 1;

	return (bumped & O_ACCMODE) ? bumped : flag;
}
1991 
1992 /*
1993  * Handle the last step of open()
1994  */
1995 static struct file *do_last(struct nameidata *nd, struct path *path,
1996 			    const struct open_flags *op, const char *pathname)
1997 {
1998 	struct dentry *dir = nd->path.dentry;
1999 	struct dentry *dentry;
2000 	int open_flag = op->open_flag;
2001 	int will_truncate = open_flag & O_TRUNC;
2002 	int want_write = 0;
2003 	int acc_mode = op->acc_mode;
2004 	struct file *filp;
2005 	int error;
2006 
2007 	nd->flags &= ~LOOKUP_PARENT;
2008 	nd->flags |= op->intent;
2009 
2010 	switch (nd->last_type) {
2011 	case LAST_DOTDOT:
2012 	case LAST_DOT:
2013 		error = handle_dots(nd, nd->last_type);
2014 		if (error)
2015 			return ERR_PTR(error);
2016 		/* fallthrough */
2017 	case LAST_ROOT:
2018 		error = complete_walk(nd);
2019 		if (error)
2020 			return ERR_PTR(error);
2021 		audit_inode(pathname, nd->path.dentry);
2022 		if (open_flag & O_CREAT) {
2023 			error = -EISDIR;
2024 			goto exit;
2025 		}
2026 		goto ok;
2027 	case LAST_BIND:
2028 		error = complete_walk(nd);
2029 		if (error)
2030 			return ERR_PTR(error);
2031 		audit_inode(pathname, dir);
2032 		goto ok;
2033 	}
2034 
2035 	if (!(open_flag & O_CREAT)) {
2036 		int symlink_ok = 0;
2037 		if (nd->last.name[nd->last.len])
2038 			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2039 		if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
2040 			symlink_ok = 1;
2041 		/* we _can_ be in RCU mode here */
2042 		error = walk_component(nd, path, &nd->last, LAST_NORM,
2043 					!symlink_ok);
2044 		if (error < 0)
2045 			return ERR_PTR(error);
2046 		if (error) /* symlink */
2047 			return NULL;
2048 		/* sayonara */
2049 		error = complete_walk(nd);
2050 		if (error)
2051 			return ERR_PTR(-ECHILD);
2052 
2053 		error = -ENOTDIR;
2054 		if (nd->flags & LOOKUP_DIRECTORY) {
2055 			if (!nd->inode->i_op->lookup)
2056 				goto exit;
2057 		}
2058 		audit_inode(pathname, nd->path.dentry);
2059 		goto ok;
2060 	}
2061 
2062 	/* create side of things */
2063 	error = complete_walk(nd);
2064 	if (error)
2065 		return ERR_PTR(error);
2066 
2067 	audit_inode(pathname, dir);
2068 	error = -EISDIR;
2069 	/* trailing slashes? */
2070 	if (nd->last.name[nd->last.len])
2071 		goto exit;
2072 
2073 	mutex_lock(&dir->d_inode->i_mutex);
2074 
2075 	dentry = lookup_hash(nd);
2076 	error = PTR_ERR(dentry);
2077 	if (IS_ERR(dentry)) {
2078 		mutex_unlock(&dir->d_inode->i_mutex);
2079 		goto exit;
2080 	}
2081 
2082 	path->dentry = dentry;
2083 	path->mnt = nd->path.mnt;
2084 
2085 	/* Negative dentry, just create the file */
2086 	if (!dentry->d_inode) {
2087 		int mode = op->mode;
2088 		if (!IS_POSIXACL(dir->d_inode))
2089 			mode &= ~current_umask();
2090 		/*
2091 		 * This write is needed to ensure that a
2092 		 * rw->ro transition does not occur between
2093 		 * the time when the file is created and when
2094 		 * a permanent write count is taken through
2095 		 * the 'struct file' in nameidata_to_filp().
2096 		 */
2097 		error = mnt_want_write(nd->path.mnt);
2098 		if (error)
2099 			goto exit_mutex_unlock;
2100 		want_write = 1;
2101 		/* Don't check for write permission, don't truncate */
2102 		open_flag &= ~O_TRUNC;
2103 		will_truncate = 0;
2104 		acc_mode = MAY_OPEN;
2105 		error = security_path_mknod(&nd->path, dentry, mode, 0);
2106 		if (error)
2107 			goto exit_mutex_unlock;
2108 		error = vfs_create(dir->d_inode, dentry, mode, nd);
2109 		if (error)
2110 			goto exit_mutex_unlock;
2111 		mutex_unlock(&dir->d_inode->i_mutex);
2112 		dput(nd->path.dentry);
2113 		nd->path.dentry = dentry;
2114 		goto common;
2115 	}
2116 
2117 	/*
2118 	 * It already exists.
2119 	 */
2120 	mutex_unlock(&dir->d_inode->i_mutex);
2121 	audit_inode(pathname, path->dentry);
2122 
2123 	error = -EEXIST;
2124 	if (open_flag & O_EXCL)
2125 		goto exit_dput;
2126 
2127 	error = follow_managed(path, nd->flags);
2128 	if (error < 0)
2129 		goto exit_dput;
2130 
2131 	error = -ENOENT;
2132 	if (!path->dentry->d_inode)
2133 		goto exit_dput;
2134 
2135 	if (path->dentry->d_inode->i_op->follow_link)
2136 		return NULL;
2137 
2138 	path_to_nameidata(path, nd);
2139 	nd->inode = path->dentry->d_inode;
2140 	error = -EISDIR;
2141 	if (S_ISDIR(nd->inode->i_mode))
2142 		goto exit;
2143 ok:
2144 	if (!S_ISREG(nd->inode->i_mode))
2145 		will_truncate = 0;
2146 
2147 	if (will_truncate) {
2148 		error = mnt_want_write(nd->path.mnt);
2149 		if (error)
2150 			goto exit;
2151 		want_write = 1;
2152 	}
2153 common:
2154 	error = may_open(&nd->path, acc_mode, open_flag);
2155 	if (error)
2156 		goto exit;
2157 	filp = nameidata_to_filp(nd);
2158 	if (!IS_ERR(filp)) {
2159 		error = ima_file_check(filp, op->acc_mode);
2160 		if (error) {
2161 			fput(filp);
2162 			filp = ERR_PTR(error);
2163 		}
2164 	}
2165 	if (!IS_ERR(filp)) {
2166 		if (will_truncate) {
2167 			error = handle_truncate(filp);
2168 			if (error) {
2169 				fput(filp);
2170 				filp = ERR_PTR(error);
2171 			}
2172 		}
2173 	}
2174 out:
2175 	if (want_write)
2176 		mnt_drop_write(nd->path.mnt);
2177 	path_put(&nd->path);
2178 	return filp;
2179 
2180 exit_mutex_unlock:
2181 	mutex_unlock(&dir->d_inode->i_mutex);
2182 exit_dput:
2183 	path_put_conditional(path, nd);
2184 exit:
2185 	filp = ERR_PTR(error);
2186 	goto out;
2187 }
2188 
2189 static struct file *path_openat(int dfd, const char *pathname,
2190 		struct nameidata *nd, const struct open_flags *op, int flags)
2191 {
2192 	struct file *base = NULL;
2193 	struct file *filp;
2194 	struct path path;
2195 	int error;
2196 
2197 	filp = get_empty_filp();
2198 	if (!filp)
2199 		return ERR_PTR(-ENFILE);
2200 
2201 	filp->f_flags = op->open_flag;
2202 	nd->intent.open.file = filp;
2203 	nd->intent.open.flags = open_to_namei_flags(op->open_flag);
2204 	nd->intent.open.create_mode = op->mode;
2205 
2206 	error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
2207 	if (unlikely(error))
2208 		goto out_filp;
2209 
2210 	current->total_link_count = 0;
2211 	error = link_path_walk(pathname, nd);
2212 	if (unlikely(error))
2213 		goto out_filp;
2214 
2215 	filp = do_last(nd, &path, op, pathname);
2216 	while (unlikely(!filp)) { /* trailing symlink */
2217 		struct path link = path;
2218 		void *cookie;
2219 		if (!(nd->flags & LOOKUP_FOLLOW)) {
2220 			path_put_conditional(&path, nd);
2221 			path_put(&nd->path);
2222 			filp = ERR_PTR(-ELOOP);
2223 			break;
2224 		}
2225 		nd->flags |= LOOKUP_PARENT;
2226 		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
2227 		error = follow_link(&link, nd, &cookie);
2228 		if (unlikely(error))
2229 			filp = ERR_PTR(error);
2230 		else
2231 			filp = do_last(nd, &path, op, pathname);
2232 		put_link(nd, &link, cookie);
2233 	}
2234 out:
2235 	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
2236 		path_put(&nd->root);
2237 	if (base)
2238 		fput(base);
2239 	release_open_intent(nd);
2240 	return filp;
2241 
2242 out_filp:
2243 	filp = ERR_PTR(error);
2244 	goto out;
2245 }
2246 
2247 struct file *do_filp_open(int dfd, const char *pathname,
2248 		const struct open_flags *op, int flags)
2249 {
2250 	struct nameidata nd;
2251 	struct file *filp;
2252 
2253 	filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
2254 	if (unlikely(filp == ERR_PTR(-ECHILD)))
2255 		filp = path_openat(dfd, pathname, &nd, op, flags);
2256 	if (unlikely(filp == ERR_PTR(-ESTALE)))
2257 		filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
2258 	return filp;
2259 }
2260 
2261 struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
2262 		const char *name, const struct open_flags *op, int flags)
2263 {
2264 	struct nameidata nd;
2265 	struct file *file;
2266 
2267 	nd.root.mnt = mnt;
2268 	nd.root.dentry = dentry;
2269 
2270 	flags |= LOOKUP_ROOT;
2271 
2272 	if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
2273 		return ERR_PTR(-ELOOP);
2274 
2275 	file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
2276 	if (unlikely(file == ERR_PTR(-ECHILD)))
2277 		file = path_openat(-1, name, &nd, op, flags);
2278 	if (unlikely(file == ERR_PTR(-ESTALE)))
2279 		file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
2280 	return file;
2281 }
2282 
2283 /**
2284  * lookup_create - lookup a dentry, creating it if it doesn't exist
2285  * @nd: nameidata info
2286  * @is_dir: directory flag
2287  *
2288  * Simple function to lookup and return a dentry and create it
2289  * if it doesn't exist.  Is SMP-safe.
2290  *
2291  * Returns with nd->path.dentry->d_inode->i_mutex locked.
2292  */
2293 struct dentry *lookup_create(struct nameidata *nd, int is_dir)
2294 {
2295 	struct dentry *dentry = ERR_PTR(-EEXIST);
2296 
2297 	mutex_lock_nested(&nd->path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2298 	/*
2299 	 * Yucky last component or no last component at all?
2300 	 * (foo/., foo/.., /////)
2301 	 */
2302 	if (nd->last_type != LAST_NORM)
2303 		goto fail;
2304 	nd->flags &= ~LOOKUP_PARENT;
2305 	nd->flags |= LOOKUP_CREATE | LOOKUP_EXCL;
2306 	nd->intent.open.flags = O_EXCL;
2307 
2308 	/*
2309 	 * Do the final lookup.
2310 	 */
2311 	dentry = lookup_hash(nd);
2312 	if (IS_ERR(dentry))
2313 		goto fail;
2314 
2315 	if (dentry->d_inode)
2316 		goto eexist;
2317 	/*
2318 	 * Special case - lookup gave negative, but... we had foo/bar/
2319 	 * From the vfs_mknod() POV we just have a negative dentry -
2320 	 * all is fine. Let's be bastards - you had / on the end, you've
2321 	 * been asking for (non-existent) directory. -ENOENT for you.
2322 	 */
2323 	if (unlikely(!is_dir && nd->last.name[nd->last.len])) {
2324 		dput(dentry);
2325 		dentry = ERR_PTR(-ENOENT);
2326 	}
2327 	return dentry;
2328 eexist:
2329 	dput(dentry);
2330 	dentry = ERR_PTR(-EEXIST);
2331 fail:
2332 	return dentry;
2333 }
2334 EXPORT_SYMBOL_GPL(lookup_create);
2335 
2336 int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
2337 {
2338 	int error = may_create(dir, dentry);
2339 
2340 	if (error)
2341 		return error;
2342 
2343 	if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
2344 	    !ns_capable(inode_userns(dir), CAP_MKNOD))
2345 		return -EPERM;
2346 
2347 	if (!dir->i_op->mknod)
2348 		return -EPERM;
2349 
2350 	error = devcgroup_inode_mknod(mode, dev);
2351 	if (error)
2352 		return error;
2353 
2354 	error = security_inode_mknod(dir, dentry, mode, dev);
2355 	if (error)
2356 		return error;
2357 
2358 	error = dir->i_op->mknod(dir, dentry, mode, dev);
2359 	if (!error)
2360 		fsnotify_create(dir, dentry);
2361 	return error;
2362 }
2363 
/*
 * Decide whether the file type encoded in @mode may be created by mknod.
 *
 * Returns 0 for regular files, character and block devices, FIFOs and
 * sockets (a zero type counts as a regular file), -EPERM for directories
 * and -EINVAL for every other type.
 */
static int may_mknod(mode_t mode)
{
	const mode_t type = mode & S_IFMT;

	if (type == S_IFDIR)
		return -EPERM;

	if (type == 0 ||		/* zero mode translates to S_IFREG */
	    type == S_IFREG ||
	    type == S_IFCHR ||
	    type == S_IFBLK ||
	    type == S_IFIFO ||
	    type == S_IFSOCK)
		return 0;

	return -EINVAL;
}
2380 
2381 SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
2382 		unsigned, dev)
2383 {
2384 	int error;
2385 	char *tmp;
2386 	struct dentry *dentry;
2387 	struct nameidata nd;
2388 
2389 	if (S_ISDIR(mode))
2390 		return -EPERM;
2391 
2392 	error = user_path_parent(dfd, filename, &nd, &tmp);
2393 	if (error)
2394 		return error;
2395 
2396 	dentry = lookup_create(&nd, 0);
2397 	if (IS_ERR(dentry)) {
2398 		error = PTR_ERR(dentry);
2399 		goto out_unlock;
2400 	}
2401 	if (!IS_POSIXACL(nd.path.dentry->d_inode))
2402 		mode &= ~current_umask();
2403 	error = may_mknod(mode);
2404 	if (error)
2405 		goto out_dput;
2406 	error = mnt_want_write(nd.path.mnt);
2407 	if (error)
2408 		goto out_dput;
2409 	error = security_path_mknod(&nd.path, dentry, mode, dev);
2410 	if (error)
2411 		goto out_drop_write;
2412 	switch (mode & S_IFMT) {
2413 		case 0: case S_IFREG:
2414 			error = vfs_create(nd.path.dentry->d_inode,dentry,mode,NULL);
2415 			break;
2416 		case S_IFCHR: case S_IFBLK:
2417 			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,
2418 					new_decode_dev(dev));
2419 			break;
2420 		case S_IFIFO: case S_IFSOCK:
2421 			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
2422 			break;
2423 	}
2424 out_drop_write:
2425 	mnt_drop_write(nd.path.mnt);
2426 out_dput:
2427 	dput(dentry);
2428 out_unlock:
2429 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2430 	path_put(&nd.path);
2431 	putname(tmp);
2432 
2433 	return error;
2434 }
2435 
2436 SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev)
2437 {
2438 	return sys_mknodat(AT_FDCWD, filename, mode, dev);
2439 }
2440 
2441 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2442 {
2443 	int error = may_create(dir, dentry);
2444 
2445 	if (error)
2446 		return error;
2447 
2448 	if (!dir->i_op->mkdir)
2449 		return -EPERM;
2450 
2451 	mode &= (S_IRWXUGO|S_ISVTX);
2452 	error = security_inode_mkdir(dir, dentry, mode);
2453 	if (error)
2454 		return error;
2455 
2456 	error = dir->i_op->mkdir(dir, dentry, mode);
2457 	if (!error)
2458 		fsnotify_mkdir(dir, dentry);
2459 	return error;
2460 }
2461 
2462 SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
2463 {
2464 	int error = 0;
2465 	char * tmp;
2466 	struct dentry *dentry;
2467 	struct nameidata nd;
2468 
2469 	error = user_path_parent(dfd, pathname, &nd, &tmp);
2470 	if (error)
2471 		goto out_err;
2472 
2473 	dentry = lookup_create(&nd, 1);
2474 	error = PTR_ERR(dentry);
2475 	if (IS_ERR(dentry))
2476 		goto out_unlock;
2477 
2478 	if (!IS_POSIXACL(nd.path.dentry->d_inode))
2479 		mode &= ~current_umask();
2480 	error = mnt_want_write(nd.path.mnt);
2481 	if (error)
2482 		goto out_dput;
2483 	error = security_path_mkdir(&nd.path, dentry, mode);
2484 	if (error)
2485 		goto out_drop_write;
2486 	error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
2487 out_drop_write:
2488 	mnt_drop_write(nd.path.mnt);
2489 out_dput:
2490 	dput(dentry);
2491 out_unlock:
2492 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2493 	path_put(&nd.path);
2494 	putname(tmp);
2495 out_err:
2496 	return error;
2497 }
2498 
2499 SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
2500 {
2501 	return sys_mkdirat(AT_FDCWD, pathname, mode);
2502 }
2503 
2504 /*
2505  * The dentry_unhash() helper will try to drop the dentry early: we
2506  * should have a usage count of 2 if we're the only user of this
2507  * dentry, and if that is true (possibly after pruning the dcache),
2508  * then we drop the dentry now.
2509  *
2510  * A low-level filesystem can, if it choses, legally
2511  * do a
2512  *
2513  *	if (!d_unhashed(dentry))
2514  *		return -EBUSY;
2515  *
2516  * if it cannot handle the case of removing a directory
2517  * that is still in use by something else..
2518  */
2519 void dentry_unhash(struct dentry *dentry)
2520 {
2521 	shrink_dcache_parent(dentry);
2522 	spin_lock(&dentry->d_lock);
2523 	if (dentry->d_count == 1)
2524 		__d_drop(dentry);
2525 	spin_unlock(&dentry->d_lock);
2526 }
2527 
2528 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2529 {
2530 	int error = may_delete(dir, dentry, 1);
2531 
2532 	if (error)
2533 		return error;
2534 
2535 	if (!dir->i_op->rmdir)
2536 		return -EPERM;
2537 
2538 	mutex_lock(&dentry->d_inode->i_mutex);
2539 
2540 	error = -EBUSY;
2541 	if (d_mountpoint(dentry))
2542 		goto out;
2543 
2544 	error = security_inode_rmdir(dir, dentry);
2545 	if (error)
2546 		goto out;
2547 
2548 	shrink_dcache_parent(dentry);
2549 	error = dir->i_op->rmdir(dir, dentry);
2550 	if (error)
2551 		goto out;
2552 
2553 	dentry->d_inode->i_flags |= S_DEAD;
2554 	dont_mount(dentry);
2555 
2556 out:
2557 	mutex_unlock(&dentry->d_inode->i_mutex);
2558 	if (!error)
2559 		d_delete(dentry);
2560 	return error;
2561 }
2562 
2563 static long do_rmdir(int dfd, const char __user *pathname)
2564 {
2565 	int error = 0;
2566 	char * name;
2567 	struct dentry *dentry;
2568 	struct nameidata nd;
2569 
2570 	error = user_path_parent(dfd, pathname, &nd, &name);
2571 	if (error)
2572 		return error;
2573 
2574 	switch(nd.last_type) {
2575 	case LAST_DOTDOT:
2576 		error = -ENOTEMPTY;
2577 		goto exit1;
2578 	case LAST_DOT:
2579 		error = -EINVAL;
2580 		goto exit1;
2581 	case LAST_ROOT:
2582 		error = -EBUSY;
2583 		goto exit1;
2584 	}
2585 
2586 	nd.flags &= ~LOOKUP_PARENT;
2587 
2588 	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2589 	dentry = lookup_hash(&nd);
2590 	error = PTR_ERR(dentry);
2591 	if (IS_ERR(dentry))
2592 		goto exit2;
2593 	if (!dentry->d_inode) {
2594 		error = -ENOENT;
2595 		goto exit3;
2596 	}
2597 	error = mnt_want_write(nd.path.mnt);
2598 	if (error)
2599 		goto exit3;
2600 	error = security_path_rmdir(&nd.path, dentry);
2601 	if (error)
2602 		goto exit4;
2603 	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
2604 exit4:
2605 	mnt_drop_write(nd.path.mnt);
2606 exit3:
2607 	dput(dentry);
2608 exit2:
2609 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2610 exit1:
2611 	path_put(&nd.path);
2612 	putname(name);
2613 	return error;
2614 }
2615 
2616 SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
2617 {
2618 	return do_rmdir(AT_FDCWD, pathname);
2619 }
2620 
2621 int vfs_unlink(struct inode *dir, struct dentry *dentry)
2622 {
2623 	int error = may_delete(dir, dentry, 0);
2624 
2625 	if (error)
2626 		return error;
2627 
2628 	if (!dir->i_op->unlink)
2629 		return -EPERM;
2630 
2631 	mutex_lock(&dentry->d_inode->i_mutex);
2632 	if (d_mountpoint(dentry))
2633 		error = -EBUSY;
2634 	else {
2635 		error = security_inode_unlink(dir, dentry);
2636 		if (!error) {
2637 			error = dir->i_op->unlink(dir, dentry);
2638 			if (!error)
2639 				dont_mount(dentry);
2640 		}
2641 	}
2642 	mutex_unlock(&dentry->d_inode->i_mutex);
2643 
2644 	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
2645 	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
2646 		fsnotify_link_count(dentry->d_inode);
2647 		d_delete(dentry);
2648 	}
2649 
2650 	return error;
2651 }
2652 
2653 /*
2654  * Make sure that the actual truncation of the file will occur outside its
2655  * directory's i_mutex.  Truncate can take a long time if there is a lot of
2656  * writeout happening, and we don't want to prevent access to the directory
2657  * while waiting on the I/O.
2658  */
2659 static long do_unlinkat(int dfd, const char __user *pathname)
2660 {
2661 	int error;
2662 	char *name;
2663 	struct dentry *dentry;
2664 	struct nameidata nd;
2665 	struct inode *inode = NULL;
2666 
2667 	error = user_path_parent(dfd, pathname, &nd, &name);
2668 	if (error)
2669 		return error;
2670 
2671 	error = -EISDIR;
2672 	if (nd.last_type != LAST_NORM)
2673 		goto exit1;
2674 
2675 	nd.flags &= ~LOOKUP_PARENT;
2676 
2677 	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2678 	dentry = lookup_hash(&nd);
2679 	error = PTR_ERR(dentry);
2680 	if (!IS_ERR(dentry)) {
2681 		/* Why not before? Because we want correct error value */
2682 		if (nd.last.name[nd.last.len])
2683 			goto slashes;
2684 		inode = dentry->d_inode;
2685 		if (!inode)
2686 			goto slashes;
2687 		ihold(inode);
2688 		error = mnt_want_write(nd.path.mnt);
2689 		if (error)
2690 			goto exit2;
2691 		error = security_path_unlink(&nd.path, dentry);
2692 		if (error)
2693 			goto exit3;
2694 		error = vfs_unlink(nd.path.dentry->d_inode, dentry);
2695 exit3:
2696 		mnt_drop_write(nd.path.mnt);
2697 	exit2:
2698 		dput(dentry);
2699 	}
2700 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2701 	if (inode)
2702 		iput(inode);	/* truncate the inode here */
2703 exit1:
2704 	path_put(&nd.path);
2705 	putname(name);
2706 	return error;
2707 
2708 slashes:
2709 	error = !dentry->d_inode ? -ENOENT :
2710 		S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
2711 	goto exit2;
2712 }
2713 
2714 SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
2715 {
2716 	if ((flag & ~AT_REMOVEDIR) != 0)
2717 		return -EINVAL;
2718 
2719 	if (flag & AT_REMOVEDIR)
2720 		return do_rmdir(dfd, pathname);
2721 
2722 	return do_unlinkat(dfd, pathname);
2723 }
2724 
2725 SYSCALL_DEFINE1(unlink, const char __user *, pathname)
2726 {
2727 	return do_unlinkat(AT_FDCWD, pathname);
2728 }
2729 
2730 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
2731 {
2732 	int error = may_create(dir, dentry);
2733 
2734 	if (error)
2735 		return error;
2736 
2737 	if (!dir->i_op->symlink)
2738 		return -EPERM;
2739 
2740 	error = security_inode_symlink(dir, dentry, oldname);
2741 	if (error)
2742 		return error;
2743 
2744 	error = dir->i_op->symlink(dir, dentry, oldname);
2745 	if (!error)
2746 		fsnotify_create(dir, dentry);
2747 	return error;
2748 }
2749 
2750 SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
2751 		int, newdfd, const char __user *, newname)
2752 {
2753 	int error;
2754 	char *from;
2755 	char *to;
2756 	struct dentry *dentry;
2757 	struct nameidata nd;
2758 
2759 	from = getname(oldname);
2760 	if (IS_ERR(from))
2761 		return PTR_ERR(from);
2762 
2763 	error = user_path_parent(newdfd, newname, &nd, &to);
2764 	if (error)
2765 		goto out_putname;
2766 
2767 	dentry = lookup_create(&nd, 0);
2768 	error = PTR_ERR(dentry);
2769 	if (IS_ERR(dentry))
2770 		goto out_unlock;
2771 
2772 	error = mnt_want_write(nd.path.mnt);
2773 	if (error)
2774 		goto out_dput;
2775 	error = security_path_symlink(&nd.path, dentry, from);
2776 	if (error)
2777 		goto out_drop_write;
2778 	error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
2779 out_drop_write:
2780 	mnt_drop_write(nd.path.mnt);
2781 out_dput:
2782 	dput(dentry);
2783 out_unlock:
2784 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2785 	path_put(&nd.path);
2786 	putname(to);
2787 out_putname:
2788 	putname(from);
2789 	return error;
2790 }
2791 
2792 SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
2793 {
2794 	return sys_symlinkat(oldname, AT_FDCWD, newname);
2795 }
2796 
2797 int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
2798 {
2799 	struct inode *inode = old_dentry->d_inode;
2800 	int error;
2801 
2802 	if (!inode)
2803 		return -ENOENT;
2804 
2805 	error = may_create(dir, new_dentry);
2806 	if (error)
2807 		return error;
2808 
2809 	if (dir->i_sb != inode->i_sb)
2810 		return -EXDEV;
2811 
2812 	/*
2813 	 * A link to an append-only or immutable file cannot be created.
2814 	 */
2815 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2816 		return -EPERM;
2817 	if (!dir->i_op->link)
2818 		return -EPERM;
2819 	if (S_ISDIR(inode->i_mode))
2820 		return -EPERM;
2821 
2822 	error = security_inode_link(old_dentry, dir, new_dentry);
2823 	if (error)
2824 		return error;
2825 
2826 	mutex_lock(&inode->i_mutex);
2827 	/* Make sure we don't allow creating hardlink to an unlinked file */
2828 	if (inode->i_nlink == 0)
2829 		error =  -ENOENT;
2830 	else
2831 		error = dir->i_op->link(old_dentry, dir, new_dentry);
2832 	mutex_unlock(&inode->i_mutex);
2833 	if (!error)
2834 		fsnotify_link(dir, inode, new_dentry);
2835 	return error;
2836 }
2837 
2838 /*
2839  * Hardlinks are often used in delicate situations.  We avoid
2840  * security-related surprises by not following symlinks on the
2841  * newname.  --KAB
2842  *
2843  * We don't follow them on the oldname either to be compatible
2844  * with linux 2.0, and to avoid hard-linking to directories
2845  * and other special files.  --ADM
2846  */
2847 SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
2848 		int, newdfd, const char __user *, newname, int, flags)
2849 {
2850 	struct dentry *new_dentry;
2851 	struct nameidata nd;
2852 	struct path old_path;
2853 	int how = 0;
2854 	int error;
2855 	char *to;
2856 
2857 	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
2858 		return -EINVAL;
2859 	/*
2860 	 * To use null names we require CAP_DAC_READ_SEARCH
2861 	 * This ensures that not everyone will be able to create
2862 	 * handlink using the passed filedescriptor.
2863 	 */
2864 	if (flags & AT_EMPTY_PATH) {
2865 		if (!capable(CAP_DAC_READ_SEARCH))
2866 			return -ENOENT;
2867 		how = LOOKUP_EMPTY;
2868 	}
2869 
2870 	if (flags & AT_SYMLINK_FOLLOW)
2871 		how |= LOOKUP_FOLLOW;
2872 
2873 	error = user_path_at(olddfd, oldname, how, &old_path);
2874 	if (error)
2875 		return error;
2876 
2877 	error = user_path_parent(newdfd, newname, &nd, &to);
2878 	if (error)
2879 		goto out;
2880 	error = -EXDEV;
2881 	if (old_path.mnt != nd.path.mnt)
2882 		goto out_release;
2883 	new_dentry = lookup_create(&nd, 0);
2884 	error = PTR_ERR(new_dentry);
2885 	if (IS_ERR(new_dentry))
2886 		goto out_unlock;
2887 	error = mnt_want_write(nd.path.mnt);
2888 	if (error)
2889 		goto out_dput;
2890 	error = security_path_link(old_path.dentry, &nd.path, new_dentry);
2891 	if (error)
2892 		goto out_drop_write;
2893 	error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
2894 out_drop_write:
2895 	mnt_drop_write(nd.path.mnt);
2896 out_dput:
2897 	dput(new_dentry);
2898 out_unlock:
2899 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2900 out_release:
2901 	path_put(&nd.path);
2902 	putname(to);
2903 out:
2904 	path_put(&old_path);
2905 
2906 	return error;
2907 }
2908 
2909 SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
2910 {
2911 	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
2912 }
2913 
2914 /*
2915  * The worst of all namespace operations - renaming directory. "Perverted"
2916  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
2917  * Problems:
2918  *	a) we can get into loop creation. Check is done in is_subdir().
2919  *	b) race potential - two innocent renames can create a loop together.
2920  *	   That's where 4.4 screws up. Current fix: serialization on
2921  *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
2922  *	   story.
2923  *	c) we have to lock _three_ objects - parents and victim (if it exists).
2924  *	   And that - after we got ->i_mutex on parents (until then we don't know
2925  *	   whether the target exists).  Solution: try to be smart with locking
2926  *	   order for inodes.  We rely on the fact that tree topology may change
2927  *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
2928  *	   move will be locked.  Thus we can rank directories by the tree
2929  *	   (ancestors first) and rank all non-directories after them.
2930  *	   That works since everybody except rename does "lock parent, lookup,
2931  *	   lock child" and rename is under ->s_vfs_rename_mutex.
2932  *	   HOWEVER, it relies on the assumption that any object with ->lookup()
2933  *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
2934  *	   we'd better make sure that there's no link(2) for them.
2935  *	d) conversion from fhandle to dentry may come in the wrong moment - when
2936  *	   we are removing the target. Solution: we will have to grab ->i_mutex
2937  *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
2938  *	   ->i_mutex on parents, which works but leads to some truly excessive
2939  *	   locking].
2940  */
2941 static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2942 			  struct inode *new_dir, struct dentry *new_dentry)
2943 {
2944 	int error = 0;
2945 	struct inode *target = new_dentry->d_inode;
2946 
2947 	/*
2948 	 * If we are going to change the parent - check write permissions,
2949 	 * we'll need to flip '..'.
2950 	 */
2951 	if (new_dir != old_dir) {
2952 		error = inode_permission(old_dentry->d_inode, MAY_WRITE);
2953 		if (error)
2954 			return error;
2955 	}
2956 
2957 	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
2958 	if (error)
2959 		return error;
2960 
2961 	if (target)
2962 		mutex_lock(&target->i_mutex);
2963 
2964 	error = -EBUSY;
2965 	if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
2966 		goto out;
2967 
2968 	if (target)
2969 		shrink_dcache_parent(new_dentry);
2970 	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2971 	if (error)
2972 		goto out;
2973 
2974 	if (target) {
2975 		target->i_flags |= S_DEAD;
2976 		dont_mount(new_dentry);
2977 	}
2978 out:
2979 	if (target)
2980 		mutex_unlock(&target->i_mutex);
2981 	if (!error)
2982 		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2983 			d_move(old_dentry,new_dentry);
2984 	return error;
2985 }
2986 
2987 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2988 			    struct inode *new_dir, struct dentry *new_dentry)
2989 {
2990 	struct inode *target = new_dentry->d_inode;
2991 	int error;
2992 
2993 	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
2994 	if (error)
2995 		return error;
2996 
2997 	dget(new_dentry);
2998 	if (target)
2999 		mutex_lock(&target->i_mutex);
3000 
3001 	error = -EBUSY;
3002 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
3003 		goto out;
3004 
3005 	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
3006 	if (error)
3007 		goto out;
3008 
3009 	if (target)
3010 		dont_mount(new_dentry);
3011 	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
3012 		d_move(old_dentry, new_dentry);
3013 out:
3014 	if (target)
3015 		mutex_unlock(&target->i_mutex);
3016 	dput(new_dentry);
3017 	return error;
3018 }
3019 
3020 int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
3021 	       struct inode *new_dir, struct dentry *new_dentry)
3022 {
3023 	int error;
3024 	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
3025 	const unsigned char *old_name;
3026 
3027 	if (old_dentry->d_inode == new_dentry->d_inode)
3028  		return 0;
3029 
3030 	error = may_delete(old_dir, old_dentry, is_dir);
3031 	if (error)
3032 		return error;
3033 
3034 	if (!new_dentry->d_inode)
3035 		error = may_create(new_dir, new_dentry);
3036 	else
3037 		error = may_delete(new_dir, new_dentry, is_dir);
3038 	if (error)
3039 		return error;
3040 
3041 	if (!old_dir->i_op->rename)
3042 		return -EPERM;
3043 
3044 	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
3045 
3046 	if (is_dir)
3047 		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
3048 	else
3049 		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
3050 	if (!error)
3051 		fsnotify_move(old_dir, new_dir, old_name, is_dir,
3052 			      new_dentry->d_inode, old_dentry);
3053 	fsnotify_oldname_free(old_name);
3054 
3055 	return error;
3056 }
3057 
3058 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
3059 		int, newdfd, const char __user *, newname)
3060 {
3061 	struct dentry *old_dir, *new_dir;
3062 	struct dentry *old_dentry, *new_dentry;
3063 	struct dentry *trap;
3064 	struct nameidata oldnd, newnd;
3065 	char *from;
3066 	char *to;
3067 	int error;
3068 
3069 	error = user_path_parent(olddfd, oldname, &oldnd, &from);
3070 	if (error)
3071 		goto exit;
3072 
3073 	error = user_path_parent(newdfd, newname, &newnd, &to);
3074 	if (error)
3075 		goto exit1;
3076 
3077 	error = -EXDEV;
3078 	if (oldnd.path.mnt != newnd.path.mnt)
3079 		goto exit2;
3080 
3081 	old_dir = oldnd.path.dentry;
3082 	error = -EBUSY;
3083 	if (oldnd.last_type != LAST_NORM)
3084 		goto exit2;
3085 
3086 	new_dir = newnd.path.dentry;
3087 	if (newnd.last_type != LAST_NORM)
3088 		goto exit2;
3089 
3090 	oldnd.flags &= ~LOOKUP_PARENT;
3091 	newnd.flags &= ~LOOKUP_PARENT;
3092 	newnd.flags |= LOOKUP_RENAME_TARGET;
3093 
3094 	trap = lock_rename(new_dir, old_dir);
3095 
3096 	old_dentry = lookup_hash(&oldnd);
3097 	error = PTR_ERR(old_dentry);
3098 	if (IS_ERR(old_dentry))
3099 		goto exit3;
3100 	/* source must exist */
3101 	error = -ENOENT;
3102 	if (!old_dentry->d_inode)
3103 		goto exit4;
3104 	/* unless the source is a directory trailing slashes give -ENOTDIR */
3105 	if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
3106 		error = -ENOTDIR;
3107 		if (oldnd.last.name[oldnd.last.len])
3108 			goto exit4;
3109 		if (newnd.last.name[newnd.last.len])
3110 			goto exit4;
3111 	}
3112 	/* source should not be ancestor of target */
3113 	error = -EINVAL;
3114 	if (old_dentry == trap)
3115 		goto exit4;
3116 	new_dentry = lookup_hash(&newnd);
3117 	error = PTR_ERR(new_dentry);
3118 	if (IS_ERR(new_dentry))
3119 		goto exit4;
3120 	/* target should not be an ancestor of source */
3121 	error = -ENOTEMPTY;
3122 	if (new_dentry == trap)
3123 		goto exit5;
3124 
3125 	error = mnt_want_write(oldnd.path.mnt);
3126 	if (error)
3127 		goto exit5;
3128 	error = security_path_rename(&oldnd.path, old_dentry,
3129 				     &newnd.path, new_dentry);
3130 	if (error)
3131 		goto exit6;
3132 	error = vfs_rename(old_dir->d_inode, old_dentry,
3133 				   new_dir->d_inode, new_dentry);
3134 exit6:
3135 	mnt_drop_write(oldnd.path.mnt);
3136 exit5:
3137 	dput(new_dentry);
3138 exit4:
3139 	dput(old_dentry);
3140 exit3:
3141 	unlock_rename(new_dir, old_dir);
3142 exit2:
3143 	path_put(&newnd.path);
3144 	putname(to);
3145 exit1:
3146 	path_put(&oldnd.path);
3147 	putname(from);
3148 exit:
3149 	return error;
3150 }
3151 
3152 SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
3153 {
3154 	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
3155 }
3156 
3157 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
3158 {
3159 	int len;
3160 
3161 	len = PTR_ERR(link);
3162 	if (IS_ERR(link))
3163 		goto out;
3164 
3165 	len = strlen(link);
3166 	if (len > (unsigned) buflen)
3167 		len = buflen;
3168 	if (copy_to_user(buffer, link, len))
3169 		len = -EFAULT;
3170 out:
3171 	return len;
3172 }
3173 
3174 /*
3175  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
3176  * have ->follow_link() touching nd only in nd_set_link().  Using (or not
3177  * using) it for any given inode is up to filesystem.
3178  */
3179 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
3180 {
3181 	struct nameidata nd;
3182 	void *cookie;
3183 	int res;
3184 
3185 	nd.depth = 0;
3186 	cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
3187 	if (IS_ERR(cookie))
3188 		return PTR_ERR(cookie);
3189 
3190 	res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
3191 	if (dentry->d_inode->i_op->put_link)
3192 		dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
3193 	return res;
3194 }
3195 
/* Exported entry point; all the work is done by __vfs_follow_link(). */
int vfs_follow_link(struct nameidata *nd, const char *link)
{
	int ret = __vfs_follow_link(nd, link);

	return ret;
}
3200 
3201 /* get the link contents into pagecache */
3202 static char *page_getlink(struct dentry * dentry, struct page **ppage)
3203 {
3204 	char *kaddr;
3205 	struct page *page;
3206 	struct address_space *mapping = dentry->d_inode->i_mapping;
3207 	page = read_mapping_page(mapping, 0, NULL);
3208 	if (IS_ERR(page))
3209 		return (char*)page;
3210 	*ppage = page;
3211 	kaddr = kmap(page);
3212 	nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
3213 	return kaddr;
3214 }
3215 
3216 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
3217 {
3218 	struct page *page = NULL;
3219 	char *s = page_getlink(dentry, &page);
3220 	int res = vfs_readlink(dentry,buffer,buflen,s);
3221 	if (page) {
3222 		kunmap(page);
3223 		page_cache_release(page);
3224 	}
3225 	return res;
3226 }
3227 
3228 void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
3229 {
3230 	struct page *page = NULL;
3231 	nd_set_link(nd, page_getlink(dentry, &page));
3232 	return page;
3233 }
3234 
/*
 * ->put_link() counterpart of page_follow_link_light(): @cookie is the
 * page that was returned there, or NULL, in which case nothing is done.
 */
void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
	struct page *pg = cookie;

	if (!pg)
		return;

	kunmap(pg);
	page_cache_release(pg);
}
3244 
3245 /*
3246  * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
3247  */
3248 int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
3249 {
3250 	struct address_space *mapping = inode->i_mapping;
3251 	struct page *page;
3252 	void *fsdata;
3253 	int err;
3254 	char *kaddr;
3255 	unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
3256 	if (nofs)
3257 		flags |= AOP_FLAG_NOFS;
3258 
3259 retry:
3260 	err = pagecache_write_begin(NULL, mapping, 0, len-1,
3261 				flags, &page, &fsdata);
3262 	if (err)
3263 		goto fail;
3264 
3265 	kaddr = kmap_atomic(page, KM_USER0);
3266 	memcpy(kaddr, symname, len-1);
3267 	kunmap_atomic(kaddr, KM_USER0);
3268 
3269 	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
3270 							page, fsdata);
3271 	if (err < 0)
3272 		goto fail;
3273 	if (err < len-1)
3274 		goto retry;
3275 
3276 	mark_inode_dirty(inode);
3277 	return 0;
3278 fail:
3279 	return err;
3280 }
3281 
3282 int page_symlink(struct inode *inode, const char *symname, int len)
3283 {
3284 	return __page_symlink(inode, symname, len,
3285 			!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
3286 }
3287 
3288 const struct inode_operations page_symlink_inode_operations = {
3289 	.readlink	= generic_readlink,
3290 	.follow_link	= page_follow_link_light,
3291 	.put_link	= page_put_link,
3292 };
3293 
/*
 * Symbols from this file that are exported to modules.  lookup_create is
 * not in this list; it is exported with EXPORT_SYMBOL_GPL right after its
 * definition.
 */
EXPORT_SYMBOL(user_path_at);
EXPORT_SYMBOL(follow_down_one);
EXPORT_SYMBOL(follow_down);
EXPORT_SYMBOL(follow_up);
EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
EXPORT_SYMBOL(getname);
EXPORT_SYMBOL(lock_rename);
EXPORT_SYMBOL(lookup_one_len);
EXPORT_SYMBOL(page_follow_link_light);
EXPORT_SYMBOL(page_put_link);
EXPORT_SYMBOL(page_readlink);
EXPORT_SYMBOL(__page_symlink);
EXPORT_SYMBOL(page_symlink);
EXPORT_SYMBOL(page_symlink_inode_operations);
EXPORT_SYMBOL(kern_path_parent);
EXPORT_SYMBOL(kern_path);
EXPORT_SYMBOL(vfs_path_lookup);
EXPORT_SYMBOL(inode_permission);
EXPORT_SYMBOL(unlock_rename);
EXPORT_SYMBOL(vfs_create);
EXPORT_SYMBOL(vfs_follow_link);
EXPORT_SYMBOL(vfs_link);
EXPORT_SYMBOL(vfs_mkdir);
EXPORT_SYMBOL(vfs_mknod);
EXPORT_SYMBOL(generic_permission);
EXPORT_SYMBOL(vfs_readlink);
EXPORT_SYMBOL(vfs_rename);
EXPORT_SYMBOL(vfs_rmdir);
EXPORT_SYMBOL(vfs_symlink);
EXPORT_SYMBOL(vfs_unlink);
EXPORT_SYMBOL(dentry_unhash);
EXPORT_SYMBOL(generic_readlink);
3326