xref: /openbmc/linux/fs/namei.c (revision 732a675a)
1 /*
2  *  linux/fs/namei.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6 
7 /*
8  * Some corrections by tytso.
9  */
10 
11 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
12  * lookup logic.
13  */
14 /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
15  */
16 
17 #include <linux/init.h>
18 #include <linux/module.h>
19 #include <linux/slab.h>
20 #include <linux/fs.h>
21 #include <linux/namei.h>
22 #include <linux/quotaops.h>
23 #include <linux/pagemap.h>
24 #include <linux/fsnotify.h>
25 #include <linux/personality.h>
26 #include <linux/security.h>
27 #include <linux/syscalls.h>
28 #include <linux/mount.h>
29 #include <linux/audit.h>
30 #include <linux/capability.h>
31 #include <linux/file.h>
32 #include <linux/fcntl.h>
33 #include <linux/device_cgroup.h>
34 #include <asm/namei.h>
35 #include <asm/uaccess.h>
36 
/*
 * ACC_MODE(x): use the O_ACCMODE bits of x as an index into a four-byte
 * string literal, yielding 0, 4, 2 or 6 for an index of 0, 1, 2 or 3
 * respectively.
 * NOTE(review): the results look like combinations of MAY_READ/MAY_WRITE
 * bits -- confirm against the callers and the MAY_* definitions.
 */
#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
38 
39 /* [Feb-1997 T. Schoebel-Theuer]
40  * Fundamental changes in the pathname lookup mechanisms (namei)
41  * were necessary because of omirr.  The reason is that omirr needs
42  * to know the _real_ pathname, not the user-supplied one, in case
43  * of symlinks (and also when transname replacements occur).
44  *
45  * The new code replaces the old recursive symlink resolution with
46  * an iterative one (in case of non-nested symlink chains).  It does
47  * this with calls to <fs>_follow_link().
48  * As a side effect, dir_namei(), _namei() and follow_link() are now
49  * replaced with a single function lookup_dentry() that can handle all
50  * the special cases of the former code.
51  *
52  * With the new dcache, the pathname is stored at each inode, at least as
53  * long as the refcount of the inode is positive.  As a side effect, the
54  * size of the dcache depends on the inode cache and thus is dynamic.
55  *
56  * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
57  * resolution to correspond with current state of the code.
58  *
59  * Note that the symlink resolution is not *completely* iterative.
60  * There is still a significant amount of tail- and mid- recursion in
61  * the algorithm.  Also, note that <fs>_readlink() is not used in
62  * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
63  * may return different results than <fs>_follow_link().  Many virtual
64  * filesystems (including /proc) exhibit this behavior.
65  */
66 
67 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
68  * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
69  * and the name already exists in form of a symlink, try to create the new
70  * name indicated by the symlink. The old code always complained that the
71  * name already exists, due to not following the symlink even if its target
72  * is nonexistent.  The new semantics affects also mknod() and link() when
73  * the name is a symlink pointing to a non-existent name.
74  *
75  * I don't know which semantics is the right one, since I have no access
76  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
77  * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
78  * "old" one. Personally, I think the new semantics is much more logical.
79  * Note that "ln old new" where "new" is a symlink pointing to a non-existing
80  * file does succeed in both HP-UX and SunOs, but not in Solaris
81  * and in the old Linux semantics.
82  */
83 
84 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
85  * semantics.  See the comments in "open_namei" and "do_link" below.
86  *
87  * [10-Sep-98 Alan Modra] Another symlink change.
88  */
89 
90 /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
91  *	inside the path - always follow.
92  *	in the last component in creation/removal/renaming - never follow.
93  *	if LOOKUP_FOLLOW passed - follow.
94  *	if the pathname has trailing slashes - follow.
95  *	otherwise - don't follow.
96  * (applied in that order).
97  *
98  * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
99  * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
100  * During the 2.4 we need to fix the userland stuff depending on it -
101  * hopefully we will be able to get rid of that wart in 2.5. So far only
102  * XEmacs seems to be relying on it...
103  */
104 /*
105  * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
106  * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
107  * any extra contention...
108  */
109 
110 static int __link_path_walk(const char *name, struct nameidata *nd);
111 
112 /* In order to reduce some races, while at the same time doing additional
113  * checking and hopefully speeding things up, we copy filenames to the
114  * kernel data space before using them..
115  *
116  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
117  * PATH_MAX includes the nul terminator --RR.
118  */
119 static int do_getname(const char __user *filename, char *page)
120 {
121 	int retval;
122 	unsigned long len = PATH_MAX;
123 
124 	if (!segment_eq(get_fs(), KERNEL_DS)) {
125 		if ((unsigned long) filename >= TASK_SIZE)
126 			return -EFAULT;
127 		if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
128 			len = TASK_SIZE - (unsigned long) filename;
129 	}
130 
131 	retval = strncpy_from_user(page, filename, len);
132 	if (retval > 0) {
133 		if (retval < len)
134 			return 0;
135 		return -ENAMETOOLONG;
136 	} else if (!retval)
137 		retval = -ENOENT;
138 	return retval;
139 }
140 
141 char * getname(const char __user * filename)
142 {
143 	char *tmp, *result;
144 
145 	result = ERR_PTR(-ENOMEM);
146 	tmp = __getname();
147 	if (tmp)  {
148 		int retval = do_getname(filename, tmp);
149 
150 		result = tmp;
151 		if (retval < 0) {
152 			__putname(tmp);
153 			result = ERR_PTR(retval);
154 		}
155 	}
156 	audit_getname(result);
157 	return result;
158 }
159 
#ifdef CONFIG_AUDITSYSCALL
/*
 * Release a pathname buffer.  When audit_dummy_context() reports false
 * the name is handed to audit_putname(); otherwise it is released
 * directly with __putname().
 */
void putname(const char *name)
{
	if (unlikely(!audit_dummy_context())) {
		audit_putname(name);
		return;
	}
	__putname(name);
}
EXPORT_SYMBOL(putname);
#endif
170 
171 
172 /**
173  * generic_permission  -  check for access rights on a Posix-like filesystem
174  * @inode:	inode to check access rights for
175  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
176  * @check_acl:	optional callback to check for Posix ACLs
177  *
178  * Used to check for read/write/execute permissions on a file.
179  * We use "fsuid" for this, letting us set arbitrary permissions
180  * for filesystem access without changing the "normal" uids which
181  * are used for other things..
182  */
183 int generic_permission(struct inode *inode, int mask,
184 		int (*check_acl)(struct inode *inode, int mask))
185 {
186 	umode_t			mode = inode->i_mode;
187 
188 	if (current->fsuid == inode->i_uid)
189 		mode >>= 6;
190 	else {
191 		if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
192 			int error = check_acl(inode, mask);
193 			if (error == -EACCES)
194 				goto check_capabilities;
195 			else if (error != -EAGAIN)
196 				return error;
197 		}
198 
199 		if (in_group_p(inode->i_gid))
200 			mode >>= 3;
201 	}
202 
203 	/*
204 	 * If the DACs are ok we don't need any capability check.
205 	 */
206 	if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask))
207 		return 0;
208 
209  check_capabilities:
210 	/*
211 	 * Read/write DACs are always overridable.
212 	 * Executable DACs are overridable if at least one exec bit is set.
213 	 */
214 	if (!(mask & MAY_EXEC) ||
215 	    (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
216 		if (capable(CAP_DAC_OVERRIDE))
217 			return 0;
218 
219 	/*
220 	 * Searching includes executable on directories, else just read.
221 	 */
222 	if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
223 		if (capable(CAP_DAC_READ_SEARCH))
224 			return 0;
225 
226 	return -EACCES;
227 }
228 
229 int permission(struct inode *inode, int mask, struct nameidata *nd)
230 {
231 	int retval, submask;
232 	struct vfsmount *mnt = NULL;
233 
234 	if (nd)
235 		mnt = nd->path.mnt;
236 
237 	if (mask & MAY_WRITE) {
238 		umode_t mode = inode->i_mode;
239 
240 		/*
241 		 * Nobody gets write access to a read-only fs.
242 		 */
243 		if (IS_RDONLY(inode) &&
244 		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
245 			return -EROFS;
246 
247 		/*
248 		 * Nobody gets write access to an immutable file.
249 		 */
250 		if (IS_IMMUTABLE(inode))
251 			return -EACCES;
252 	}
253 
254 	if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
255 		/*
256 		 * MAY_EXEC on regular files is denied if the fs is mounted
257 		 * with the "noexec" flag.
258 		 */
259 		if (mnt && (mnt->mnt_flags & MNT_NOEXEC))
260 			return -EACCES;
261 	}
262 
263 	/* Ordinary permission routines do not understand MAY_APPEND. */
264 	submask = mask & ~MAY_APPEND;
265 	if (inode->i_op && inode->i_op->permission) {
266 		retval = inode->i_op->permission(inode, submask, nd);
267 		if (!retval) {
268 			/*
269 			 * Exec permission on a regular file is denied if none
270 			 * of the execute bits are set.
271 			 *
272 			 * This check should be done by the ->permission()
273 			 * method.
274 			 */
275 			if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode) &&
276 			    !(inode->i_mode & S_IXUGO))
277 				return -EACCES;
278 		}
279 	} else {
280 		retval = generic_permission(inode, submask, NULL);
281 	}
282 	if (retval)
283 		return retval;
284 
285 	retval = devcgroup_inode_permission(inode, mask);
286 	if (retval)
287 		return retval;
288 
289 	return security_inode_permission(inode, mask, nd);
290 }
291 
292 /**
293  * vfs_permission  -  check for access rights to a given path
294  * @nd:		lookup result that describes the path
295  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
296  *
297  * Used to check for read/write/execute permissions on a path.
298  * We use "fsuid" for this, letting us set arbitrary permissions
299  * for filesystem access without changing the "normal" uids which
300  * are used for other things.
301  */
302 int vfs_permission(struct nameidata *nd, int mask)
303 {
304 	return permission(nd->path.dentry->d_inode, mask, nd);
305 }
306 
307 /**
308  * file_permission  -  check for additional access rights to a given file
309  * @file:	file to check access rights for
310  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
311  *
312  * Used to check for read/write/execute permissions on an already opened
313  * file.
314  *
315  * Note:
316  *	Do not use this function in new code.  All access checks should
317  *	be done using vfs_permission().
318  */
319 int file_permission(struct file *file, int mask)
320 {
321 	return permission(file->f_path.dentry->d_inode, mask, NULL);
322 }
323 
324 /*
325  * get_write_access() gets write permission for a file.
326  * put_write_access() releases this write permission.
327  * This is used for regular files.
328  * We cannot support write (and maybe mmap read-write shared) accesses and
329  * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
330  * can have the following values:
331  * 0: no writers, no VM_DENYWRITE mappings
332  * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
333  * > 0: (i_writecount) users are writing to the file.
334  *
335  * Normally we operate on that counter with atomic_{inc,dec} and it's safe
336  * except for the cases where we don't hold i_writecount yet. Then we need to
337  * use {get,deny}_write_access() - these functions check the sign and refuse
338  * to do the change if sign is wrong. Exclusion between them is provided by
339  * the inode->i_lock spinlock.
340  */
341 
342 int get_write_access(struct inode * inode)
343 {
344 	spin_lock(&inode->i_lock);
345 	if (atomic_read(&inode->i_writecount) < 0) {
346 		spin_unlock(&inode->i_lock);
347 		return -ETXTBSY;
348 	}
349 	atomic_inc(&inode->i_writecount);
350 	spin_unlock(&inode->i_lock);
351 
352 	return 0;
353 }
354 
355 int deny_write_access(struct file * file)
356 {
357 	struct inode *inode = file->f_path.dentry->d_inode;
358 
359 	spin_lock(&inode->i_lock);
360 	if (atomic_read(&inode->i_writecount) > 0) {
361 		spin_unlock(&inode->i_lock);
362 		return -ETXTBSY;
363 	}
364 	atomic_dec(&inode->i_writecount);
365 	spin_unlock(&inode->i_lock);
366 
367 	return 0;
368 }
369 
370 /**
371  * path_get - get a reference to a path
372  * @path: path to get the reference to
373  *
374  * Given a path increment the reference count to the dentry and the vfsmount.
375  */
376 void path_get(struct path *path)
377 {
378 	mntget(path->mnt);
379 	dget(path->dentry);
380 }
381 EXPORT_SYMBOL(path_get);
382 
383 /**
384  * path_put - put a reference to a path
385  * @path: path to put the reference to
386  *
387  * Given a path decrement the reference count to the dentry and the vfsmount.
388  */
389 void path_put(struct path *path)
390 {
391 	dput(path->dentry);
392 	mntput(path->mnt);
393 }
394 EXPORT_SYMBOL(path_put);
395 
396 /**
397  * release_open_intent - free up open intent resources
398  * @nd: pointer to nameidata
399  */
400 void release_open_intent(struct nameidata *nd)
401 {
402 	if (nd->intent.open.file->f_path.dentry == NULL)
403 		put_filp(nd->intent.open.file);
404 	else
405 		fput(nd->intent.open.file);
406 }
407 
408 static inline struct dentry *
409 do_revalidate(struct dentry *dentry, struct nameidata *nd)
410 {
411 	int status = dentry->d_op->d_revalidate(dentry, nd);
412 	if (unlikely(status <= 0)) {
413 		/*
414 		 * The dentry failed validation.
415 		 * If d_revalidate returned 0 attempt to invalidate
416 		 * the dentry otherwise d_revalidate is asking us
417 		 * to return a fail status.
418 		 */
419 		if (!status) {
420 			if (!d_invalidate(dentry)) {
421 				dput(dentry);
422 				dentry = NULL;
423 			}
424 		} else {
425 			dput(dentry);
426 			dentry = ERR_PTR(status);
427 		}
428 	}
429 	return dentry;
430 }
431 
432 /*
433  * Internal lookup() using the new generic dcache.
434  * SMP-safe
435  */
436 static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
437 {
438 	struct dentry * dentry = __d_lookup(parent, name);
439 
440 	/* lockess __d_lookup may fail due to concurrent d_move()
441 	 * in some unrelated directory, so try with d_lookup
442 	 */
443 	if (!dentry)
444 		dentry = d_lookup(parent, name);
445 
446 	if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
447 		dentry = do_revalidate(dentry, nd);
448 
449 	return dentry;
450 }
451 
452 /*
453  * Short-cut version of permission(), for calling by
454  * path_walk(), when dcache lock is held.  Combines parts
455  * of permission() and generic_permission(), and tests ONLY for
456  * MAY_EXEC permission.
457  *
458  * If appropriate, check DAC only.  If not appropriate, or
459  * short-cut DAC fails, then call permission() to do more
460  * complete permission check.
461  */
462 static int exec_permission_lite(struct inode *inode,
463 				       struct nameidata *nd)
464 {
465 	umode_t	mode = inode->i_mode;
466 
467 	if (inode->i_op && inode->i_op->permission)
468 		return -EAGAIN;
469 
470 	if (current->fsuid == inode->i_uid)
471 		mode >>= 6;
472 	else if (in_group_p(inode->i_gid))
473 		mode >>= 3;
474 
475 	if (mode & MAY_EXEC)
476 		goto ok;
477 
478 	if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
479 		goto ok;
480 
481 	if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
482 		goto ok;
483 
484 	if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH))
485 		goto ok;
486 
487 	return -EACCES;
488 ok:
489 	return security_inode_permission(inode, MAY_EXEC, nd);
490 }
491 
492 /*
493  * This is called when everything else fails, and we actually have
494  * to go to the low-level filesystem to find out what we should do..
495  *
496  * We get the directory semaphore, and after getting that we also
497  * make sure that nobody added the entry to the dcache in the meantime..
498  * SMP-safe
499  */
500 static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
501 {
502 	struct dentry * result;
503 	struct inode *dir = parent->d_inode;
504 
505 	mutex_lock(&dir->i_mutex);
506 	/*
507 	 * First re-do the cached lookup just in case it was created
508 	 * while we waited for the directory semaphore..
509 	 *
510 	 * FIXME! This could use version numbering or similar to
511 	 * avoid unnecessary cache lookups.
512 	 *
513 	 * The "dcache_lock" is purely to protect the RCU list walker
514 	 * from concurrent renames at this point (we mustn't get false
515 	 * negatives from the RCU list walk here, unlike the optimistic
516 	 * fast walk).
517 	 *
518 	 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
519 	 */
520 	result = d_lookup(parent, name);
521 	if (!result) {
522 		struct dentry * dentry = d_alloc(parent, name);
523 		result = ERR_PTR(-ENOMEM);
524 		if (dentry) {
525 			result = dir->i_op->lookup(dir, dentry, nd);
526 			if (result)
527 				dput(dentry);
528 			else
529 				result = dentry;
530 		}
531 		mutex_unlock(&dir->i_mutex);
532 		return result;
533 	}
534 
535 	/*
536 	 * Uhhuh! Nasty case: the cache was re-populated while
537 	 * we waited on the semaphore. Need to revalidate.
538 	 */
539 	mutex_unlock(&dir->i_mutex);
540 	if (result->d_op && result->d_op->d_revalidate) {
541 		result = do_revalidate(result, nd);
542 		if (!result)
543 			result = ERR_PTR(-ENOENT);
544 	}
545 	return result;
546 }
547 
548 static int __emul_lookup_dentry(const char *, struct nameidata *);
549 
550 /* SMP-safe */
551 static __always_inline int
552 walk_init_root(const char *name, struct nameidata *nd)
553 {
554 	struct fs_struct *fs = current->fs;
555 
556 	read_lock(&fs->lock);
557 	if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) {
558 		nd->path = fs->altroot;
559 		path_get(&fs->altroot);
560 		read_unlock(&fs->lock);
561 		if (__emul_lookup_dentry(name,nd))
562 			return 0;
563 		read_lock(&fs->lock);
564 	}
565 	nd->path = fs->root;
566 	path_get(&fs->root);
567 	read_unlock(&fs->lock);
568 	return 1;
569 }
570 
571 /*
572  * Wrapper to retry pathname resolution whenever the underlying
573  * file system returns an ESTALE.
574  *
575  * Retry the whole path once, forcing real lookup requests
576  * instead of relying on the dcache.
577  */
578 static __always_inline int link_path_walk(const char *name, struct nameidata *nd)
579 {
580 	struct path save = nd->path;
581 	int result;
582 
583 	/* make sure the stuff we saved doesn't go away */
584 	dget(save.dentry);
585 	mntget(save.mnt);
586 
587 	result = __link_path_walk(name, nd);
588 	if (result == -ESTALE) {
589 		/* nd->path had been dropped */
590 		nd->path = save;
591 		dget(nd->path.dentry);
592 		mntget(nd->path.mnt);
593 		nd->flags |= LOOKUP_REVAL;
594 		result = __link_path_walk(name, nd);
595 	}
596 
597 	path_put(&save);
598 
599 	return result;
600 }
601 
602 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
603 {
604 	int res = 0;
605 	char *name;
606 	if (IS_ERR(link))
607 		goto fail;
608 
609 	if (*link == '/') {
610 		path_put(&nd->path);
611 		if (!walk_init_root(link, nd))
612 			/* weird __emul_prefix() stuff did it */
613 			goto out;
614 	}
615 	res = link_path_walk(link, nd);
616 out:
617 	if (nd->depth || res || nd->last_type!=LAST_NORM)
618 		return res;
619 	/*
620 	 * If it is an iterative symlinks resolution in open_namei() we
621 	 * have to copy the last component. And all that crap because of
622 	 * bloody create() on broken symlinks. Furrfu...
623 	 */
624 	name = __getname();
625 	if (unlikely(!name)) {
626 		path_put(&nd->path);
627 		return -ENOMEM;
628 	}
629 	strcpy(name, nd->last.name);
630 	nd->last.name = name;
631 	return 0;
632 fail:
633 	path_put(&nd->path);
634 	return PTR_ERR(link);
635 }
636 
637 static void path_put_conditional(struct path *path, struct nameidata *nd)
638 {
639 	dput(path->dentry);
640 	if (path->mnt != nd->path.mnt)
641 		mntput(path->mnt);
642 }
643 
644 static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
645 {
646 	dput(nd->path.dentry);
647 	if (nd->path.mnt != path->mnt)
648 		mntput(nd->path.mnt);
649 	nd->path.mnt = path->mnt;
650 	nd->path.dentry = path->dentry;
651 }
652 
653 static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd)
654 {
655 	int error;
656 	void *cookie;
657 	struct dentry *dentry = path->dentry;
658 
659 	touch_atime(path->mnt, dentry);
660 	nd_set_link(nd, NULL);
661 
662 	if (path->mnt != nd->path.mnt) {
663 		path_to_nameidata(path, nd);
664 		dget(dentry);
665 	}
666 	mntget(path->mnt);
667 	cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
668 	error = PTR_ERR(cookie);
669 	if (!IS_ERR(cookie)) {
670 		char *s = nd_get_link(nd);
671 		error = 0;
672 		if (s)
673 			error = __vfs_follow_link(nd, s);
674 		if (dentry->d_inode->i_op->put_link)
675 			dentry->d_inode->i_op->put_link(dentry, nd, cookie);
676 	}
677 	path_put(path);
678 
679 	return error;
680 }
681 
682 /*
683  * This limits recursive symlink follows to 8, while
684  * limiting consecutive symlinks to 40.
685  *
686  * Without that kind of total limit, nasty chains of consecutive
687  * symlinks can cause almost arbitrarily long lookups.
688  */
689 static inline int do_follow_link(struct path *path, struct nameidata *nd)
690 {
691 	int err = -ELOOP;
692 	if (current->link_count >= MAX_NESTED_LINKS)
693 		goto loop;
694 	if (current->total_link_count >= 40)
695 		goto loop;
696 	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
697 	cond_resched();
698 	err = security_inode_follow_link(path->dentry, nd);
699 	if (err)
700 		goto loop;
701 	current->link_count++;
702 	current->total_link_count++;
703 	nd->depth++;
704 	err = __do_follow_link(path, nd);
705 	current->link_count--;
706 	nd->depth--;
707 	return err;
708 loop:
709 	path_put_conditional(path, nd);
710 	path_put(&nd->path);
711 	return err;
712 }
713 
714 int follow_up(struct vfsmount **mnt, struct dentry **dentry)
715 {
716 	struct vfsmount *parent;
717 	struct dentry *mountpoint;
718 	spin_lock(&vfsmount_lock);
719 	parent=(*mnt)->mnt_parent;
720 	if (parent == *mnt) {
721 		spin_unlock(&vfsmount_lock);
722 		return 0;
723 	}
724 	mntget(parent);
725 	mountpoint=dget((*mnt)->mnt_mountpoint);
726 	spin_unlock(&vfsmount_lock);
727 	dput(*dentry);
728 	*dentry = mountpoint;
729 	mntput(*mnt);
730 	*mnt = parent;
731 	return 1;
732 }
733 
734 /* no need for dcache_lock, as serialization is taken care in
735  * namespace.c
736  */
737 static int __follow_mount(struct path *path)
738 {
739 	int res = 0;
740 	while (d_mountpoint(path->dentry)) {
741 		struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry);
742 		if (!mounted)
743 			break;
744 		dput(path->dentry);
745 		if (res)
746 			mntput(path->mnt);
747 		path->mnt = mounted;
748 		path->dentry = dget(mounted->mnt_root);
749 		res = 1;
750 	}
751 	return res;
752 }
753 
754 static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
755 {
756 	while (d_mountpoint(*dentry)) {
757 		struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
758 		if (!mounted)
759 			break;
760 		dput(*dentry);
761 		mntput(*mnt);
762 		*mnt = mounted;
763 		*dentry = dget(mounted->mnt_root);
764 	}
765 }
766 
767 /* no need for dcache_lock, as serialization is taken care in
768  * namespace.c
769  */
770 int follow_down(struct vfsmount **mnt, struct dentry **dentry)
771 {
772 	struct vfsmount *mounted;
773 
774 	mounted = lookup_mnt(*mnt, *dentry);
775 	if (mounted) {
776 		dput(*dentry);
777 		mntput(*mnt);
778 		*mnt = mounted;
779 		*dentry = dget(mounted->mnt_root);
780 		return 1;
781 	}
782 	return 0;
783 }
784 
785 static __always_inline void follow_dotdot(struct nameidata *nd)
786 {
787 	struct fs_struct *fs = current->fs;
788 
789 	while(1) {
790 		struct vfsmount *parent;
791 		struct dentry *old = nd->path.dentry;
792 
793                 read_lock(&fs->lock);
794 		if (nd->path.dentry == fs->root.dentry &&
795 		    nd->path.mnt == fs->root.mnt) {
796                         read_unlock(&fs->lock);
797 			break;
798 		}
799                 read_unlock(&fs->lock);
800 		spin_lock(&dcache_lock);
801 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
802 			nd->path.dentry = dget(nd->path.dentry->d_parent);
803 			spin_unlock(&dcache_lock);
804 			dput(old);
805 			break;
806 		}
807 		spin_unlock(&dcache_lock);
808 		spin_lock(&vfsmount_lock);
809 		parent = nd->path.mnt->mnt_parent;
810 		if (parent == nd->path.mnt) {
811 			spin_unlock(&vfsmount_lock);
812 			break;
813 		}
814 		mntget(parent);
815 		nd->path.dentry = dget(nd->path.mnt->mnt_mountpoint);
816 		spin_unlock(&vfsmount_lock);
817 		dput(old);
818 		mntput(nd->path.mnt);
819 		nd->path.mnt = parent;
820 	}
821 	follow_mount(&nd->path.mnt, &nd->path.dentry);
822 }
823 
824 /*
825  *  It's more convoluted than I'd like it to be, but... it's still fairly
826  *  small and for now I'd prefer to have fast path as straight as possible.
827  *  It _is_ time-critical.
828  */
829 static int do_lookup(struct nameidata *nd, struct qstr *name,
830 		     struct path *path)
831 {
832 	struct vfsmount *mnt = nd->path.mnt;
833 	struct dentry *dentry = __d_lookup(nd->path.dentry, name);
834 
835 	if (!dentry)
836 		goto need_lookup;
837 	if (dentry->d_op && dentry->d_op->d_revalidate)
838 		goto need_revalidate;
839 done:
840 	path->mnt = mnt;
841 	path->dentry = dentry;
842 	__follow_mount(path);
843 	return 0;
844 
845 need_lookup:
846 	dentry = real_lookup(nd->path.dentry, name, nd);
847 	if (IS_ERR(dentry))
848 		goto fail;
849 	goto done;
850 
851 need_revalidate:
852 	dentry = do_revalidate(dentry, nd);
853 	if (!dentry)
854 		goto need_lookup;
855 	if (IS_ERR(dentry))
856 		goto fail;
857 	goto done;
858 
859 fail:
860 	return PTR_ERR(dentry);
861 }
862 
/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * @name: pathname to resolve, relative to the position held in @nd
 * @nd:   walk state; nd->path is the starting point and, on success,
 *        the result.  nd->flags selects the lookup behaviour.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 *
 * Note that several labels (return_reval, last_component, ...) live
 * inside the for(;;) body and are entered by goto, including from before
 * the loop; a "break" anywhere in the body therefore leads to the common
 * path_put() on the way out.
 */
static int __link_path_walk(const char *name, struct nameidata *nd)
{
	struct path next;
	struct inode *inode;
	int err;
	unsigned int lookup_flags = nd->flags;

	/* Skip leading slashes; nothing left means only revalidation remains. */
	while (*name=='/')
		name++;
	if (!*name)
		goto return_reval;

	inode = nd->path.dentry->d_inode;
	/* Inside a nested symlink: always follow, keep only LOOKUP_CONTINUE. */
	if (nd->depth)
		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);

	/* At this point we know we have a real path component. */
	for(;;) {
		unsigned long hash;
		struct qstr this;
		unsigned int c;

		nd->flags |= LOOKUP_CONTINUE;
		/* Try the cheap exec check first; -EAGAIN asks for the full one. */
		err = exec_permission_lite(inode, nd);
		if (err == -EAGAIN)
			err = vfs_permission(nd, MAY_EXEC);
 		if (err)
			break;

		this.name = name;
		c = *(const unsigned char *)name;

		/* Hash the component while scanning to its end ('/' or NUL). */
		hash = init_name_hash();
		do {
			name++;
			hash = partial_name_hash(c, hash);
			c = *(const unsigned char *)name;
		} while (c && (c != '/'));
		this.len = name - (const char *) this.name;
		this.hash = end_name_hash(hash);

		/* remove trailing slashes? */
		if (!c)
			goto last_component;
		while (*++name == '/');
		if (!*name)
			goto last_with_slashes;

		/*
		 * "." and ".." are special - ".." especially so because it has
		 * to be able to know about the current root directory and
		 * parent relationships.
		 */
		if (this.name[0] == '.') switch (this.len) {
			default:
				break;
			case 2:
				if (this.name[1] != '.')
					break;
				follow_dotdot(nd);
				inode = nd->path.dentry->d_inode;
				/* fallthrough */
			case 1:
				continue;
		}
		/*
		 * See if the low-level filesystem might want
		 * to use its own hash..
		 */
		if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
			err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
							    &this);
			if (err < 0)
				break;
		}
		/* This does the actual lookups.. */
		err = do_lookup(nd, &this, &next);
		if (err)
			break;

		/* An intermediate component must exist and have inode operations. */
		err = -ENOENT;
		inode = next.dentry->d_inode;
		if (!inode)
			goto out_dput;
		err = -ENOTDIR;
		if (!inode->i_op)
			goto out_dput;

		if (inode->i_op->follow_link) {
			/* Symlinks inside the path are always followed. */
			err = do_follow_link(&next, nd);
			if (err)
				goto return_err;
			err = -ENOENT;
			inode = nd->path.dentry->d_inode;
			if (!inode)
				break;
			err = -ENOTDIR;
			if (!inode->i_op)
				break;
		} else
			path_to_nameidata(&next, nd);
		/* Whatever we ended up on must support ->lookup() to continue. */
		err = -ENOTDIR;
		if (!inode->i_op->lookup)
			break;
		continue;
		/* here ends the main loop */

last_with_slashes:
		/* Trailing slashes: follow a final symlink and require a directory. */
		lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
last_component:
		/* Clear LOOKUP_CONTINUE iff it was previously unset */
		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
		if (lookup_flags & LOOKUP_PARENT)
			goto lookup_parent;
		if (this.name[0] == '.') switch (this.len) {
			default:
				break;
			case 2:
				if (this.name[1] != '.')
					break;
				follow_dotdot(nd);
				inode = nd->path.dentry->d_inode;
				/* fallthrough */
			case 1:
				goto return_reval;
		}
		if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
			err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
							    &this);
			if (err < 0)
				break;
		}
		err = do_lookup(nd, &this, &next);
		if (err)
			break;
		inode = next.dentry->d_inode;
		/* The final symlink is followed only when LOOKUP_FOLLOW is in effect. */
		if ((lookup_flags & LOOKUP_FOLLOW)
		    && inode && inode->i_op && inode->i_op->follow_link) {
			err = do_follow_link(&next, nd);
			if (err)
				goto return_err;
			inode = nd->path.dentry->d_inode;
		} else
			path_to_nameidata(&next, nd);
		err = -ENOENT;
		if (!inode)
			break;
		if (lookup_flags & LOOKUP_DIRECTORY) {
			err = -ENOTDIR;
			if (!inode->i_op || !inode->i_op->lookup)
				break;
		}
		goto return_base;
lookup_parent:
		/*
		 * LOOKUP_PARENT: stop at the parent and record the last
		 * component and its kind in nd instead of looking it up.
		 */
		nd->last = this;
		nd->last_type = LAST_NORM;
		if (this.name[0] != '.')
			goto return_base;
		if (this.len == 1)
			nd->last_type = LAST_DOT;
		else if (this.len == 2 && this.name[1] == '.')
			nd->last_type = LAST_DOTDOT;
		else
			goto return_base;
return_reval:
		/*
		 * We bypassed the ordinary revalidation routines.
		 * We may need to check the cached dentry for staleness.
		 *
		 * NOTE(review): ->d_op is dereferenced below without a NULL
		 * check; presumably every FS_REVAL_DOT filesystem installs
		 * dentry operations -- confirm.
		 */
		if (nd->path.dentry && nd->path.dentry->d_sb &&
		    (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
			err = -ESTALE;
			/* Note: we do not d_invalidate() */
			if (!nd->path.dentry->d_op->d_revalidate(
					nd->path.dentry, nd))
				break;
		}
return_base:
		return 0;
out_dput:
		/* Drop the looked-up entry, then fall out to drop nd's path too. */
		path_put_conditional(&next, nd);
		break;
	}
	path_put(&nd->path);
return_err:
	return err;
}
1058 
1059 static int path_walk(const char *name, struct nameidata *nd)
1060 {
1061 	current->total_link_count = 0;
1062 	return link_path_walk(name, nd);
1063 }
1064 
1065 /*
1066  * SMP-safe: Returns 1 and nd will have valid dentry and mnt, if
1067  * everything is done. Returns 0 and drops input nd, if lookup failed;
1068  */
1069 static int __emul_lookup_dentry(const char *name, struct nameidata *nd)
1070 {
1071 	if (path_walk(name, nd))
1072 		return 0;		/* something went wrong... */
1073 
1074 	if (!nd->path.dentry->d_inode ||
1075 	    S_ISDIR(nd->path.dentry->d_inode->i_mode)) {
1076 		struct path old_path = nd->path;
1077 		struct qstr last = nd->last;
1078 		int last_type = nd->last_type;
1079 		struct fs_struct *fs = current->fs;
1080 
1081 		/*
1082 		 * NAME was not found in alternate root or it's a directory.
1083 		 * Try to find it in the normal root:
1084 		 */
1085 		nd->last_type = LAST_ROOT;
1086 		read_lock(&fs->lock);
1087 		nd->path = fs->root;
1088 		path_get(&fs->root);
1089 		read_unlock(&fs->lock);
1090 		if (path_walk(name, nd) == 0) {
1091 			if (nd->path.dentry->d_inode) {
1092 				path_put(&old_path);
1093 				return 1;
1094 			}
1095 			path_put(&nd->path);
1096 		}
1097 		nd->path = old_path;
1098 		nd->last = last;
1099 		nd->last_type = last_type;
1100 	}
1101 	return 1;
1102 }
1103 
1104 void set_fs_altroot(void)
1105 {
1106 	char *emul = __emul_prefix();
1107 	struct nameidata nd;
1108 	struct path path = {}, old_path;
1109 	int err;
1110 	struct fs_struct *fs = current->fs;
1111 
1112 	if (!emul)
1113 		goto set_it;
1114 	err = path_lookup(emul, LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_NOALT, &nd);
1115 	if (!err)
1116 		path = nd.path;
1117 set_it:
1118 	write_lock(&fs->lock);
1119 	old_path = fs->altroot;
1120 	fs->altroot = path;
1121 	write_unlock(&fs->lock);
1122 	if (old_path.dentry)
1123 		path_put(&old_path);
1124 }
1125 
/*
 * Resolve @name into @nd.
 *
 * The starting point of the walk is chosen from the name and @dfd:
 *  - a name beginning with '/' starts at the alternate root (when one is
 *    set and LOOKUP_NOALT is not in @flags) or else at current->fs->root;
 *  - otherwise, @dfd == AT_FDCWD starts at current->fs->pwd;
 *  - otherwise @dfd must refer to an open directory on which the caller
 *    has MAY_EXEC permission.
 *
 * Returns 0 and nd will be valid on success; Returns error, otherwise.
 * -EBADF is returned for a bad @dfd and -ENOTDIR when @dfd is not a
 * directory.  A successful lookup that ends on a positive dentry is
 * reported through audit_inode() unless the audit context is a dummy.
 */
static int do_path_lookup(int dfd, const char *name,
				unsigned int flags, struct nameidata *nd)
{
	int retval = 0;
	int fput_needed;
	struct file *file;
	struct fs_struct *fs = current->fs;

	nd->last_type = LAST_ROOT; /* if there are only slashes... */
	nd->flags = flags;
	nd->depth = 0;

	if (*name=='/') {
		read_lock(&fs->lock);
		if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) {
			/* try the alternate root first, without holding fs->lock */
			nd->path = fs->altroot;
			path_get(&fs->altroot);
			read_unlock(&fs->lock);
			/* retval is still 0 here, so "out" reports success */
			if (__emul_lookup_dentry(name,nd))
				goto out; /* found in altroot */
			/* altroot walk failed: retake the lock and use the real root */
			read_lock(&fs->lock);
		}
		nd->path = fs->root;
		path_get(&fs->root);
		read_unlock(&fs->lock);
	} else if (dfd == AT_FDCWD) {
		/* relative name, no directory fd: start at the working directory */
		read_lock(&fs->lock);
		nd->path = fs->pwd;
		path_get(&fs->pwd);
		read_unlock(&fs->lock);
	} else {
		struct dentry *dentry;

		/* relative name with an explicit directory fd */
		file = fget_light(dfd, &fput_needed);
		retval = -EBADF;
		if (!file)
			goto out_fail;

		dentry = file->f_path.dentry;

		retval = -ENOTDIR;
		if (!S_ISDIR(dentry->d_inode->i_mode))
			goto fput_fail;

		retval = file_permission(file, MAY_EXEC);
		if (retval)
			goto fput_fail;

		/* take our own reference on the path before dropping the file */
		nd->path = file->f_path;
		path_get(&file->f_path);

		fput_light(file, fput_needed);
	}

	retval = path_walk(name, nd);
out:
	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
				nd->path.dentry->d_inode))
		audit_inode(name, nd->path.dentry);
out_fail:
	return retval;

fput_fail:
	/* error after fget_light() succeeded: drop the file, keep retval */
	fput_light(file, fput_needed);
	goto out_fail;
}
1193 
1194 int path_lookup(const char *name, unsigned int flags,
1195 			struct nameidata *nd)
1196 {
1197 	return do_path_lookup(AT_FDCWD, name, flags, nd);
1198 }
1199 
1200 /**
1201  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
1202  * @dentry:  pointer to dentry of the base directory
1203  * @mnt: pointer to vfs mount of the base directory
1204  * @name: pointer to file name
1205  * @flags: lookup flags
1206  * @nd: pointer to nameidata
1207  */
1208 int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1209 		    const char *name, unsigned int flags,
1210 		    struct nameidata *nd)
1211 {
1212 	int retval;
1213 
1214 	/* same as do_path_lookup */
1215 	nd->last_type = LAST_ROOT;
1216 	nd->flags = flags;
1217 	nd->depth = 0;
1218 
1219 	nd->path.mnt = mntget(mnt);
1220 	nd->path.dentry = dget(dentry);
1221 
1222 	retval = path_walk(name, nd);
1223 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1224 				nd->path.dentry->d_inode))
1225 		audit_inode(name, nd->path.dentry);
1226 
1227 	return retval;
1228 
1229 }
1230 
1231 static int __path_lookup_intent_open(int dfd, const char *name,
1232 		unsigned int lookup_flags, struct nameidata *nd,
1233 		int open_flags, int create_mode)
1234 {
1235 	struct file *filp = get_empty_filp();
1236 	int err;
1237 
1238 	if (filp == NULL)
1239 		return -ENFILE;
1240 	nd->intent.open.file = filp;
1241 	nd->intent.open.flags = open_flags;
1242 	nd->intent.open.create_mode = create_mode;
1243 	err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);
1244 	if (IS_ERR(nd->intent.open.file)) {
1245 		if (err == 0) {
1246 			err = PTR_ERR(nd->intent.open.file);
1247 			path_put(&nd->path);
1248 		}
1249 	} else if (err != 0)
1250 		release_open_intent(nd);
1251 	return err;
1252 }
1253 
/**
 * path_lookup_open - lookup a file path with open intent
 * @dfd: directory fd used as the base, or AT_FDCWD
 * @name: file name to resolve
 * @lookup_flags: lookup intent flags
 * @nd: nameidata to fill in
 * @open_flags: open intent flags
 *
 * Same as __path_lookup_intent_open() with a create mode of 0.
 */
int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
		struct nameidata *nd, int open_flags)
{
	const int create_mode = 0;

	return __path_lookup_intent_open(dfd, name, lookup_flags, nd,
					 open_flags, create_mode);
}
1268 
1269 /**
1270  * path_lookup_create - lookup a file path with open + create intent
1271  * @dfd: the directory to use as base, or AT_FDCWD
1272  * @name: pointer to file name
1273  * @lookup_flags: lookup intent flags
1274  * @nd: pointer to nameidata
1275  * @open_flags: open intent flags
1276  * @create_mode: create intent flags
1277  */
1278 static int path_lookup_create(int dfd, const char *name,
1279 			      unsigned int lookup_flags, struct nameidata *nd,
1280 			      int open_flags, int create_mode)
1281 {
1282 	return __path_lookup_intent_open(dfd, name, lookup_flags|LOOKUP_CREATE,
1283 			nd, open_flags, create_mode);
1284 }
1285 
1286 int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
1287 		struct nameidata *nd, int open_flags)
1288 {
1289 	char *tmp = getname(name);
1290 	int err = PTR_ERR(tmp);
1291 
1292 	if (!IS_ERR(tmp)) {
1293 		err = __path_lookup_intent_open(AT_FDCWD, tmp, lookup_flags, nd, open_flags, 0);
1294 		putname(tmp);
1295 	}
1296 	return err;
1297 }
1298 
1299 static struct dentry *__lookup_hash(struct qstr *name,
1300 		struct dentry *base, struct nameidata *nd)
1301 {
1302 	struct dentry *dentry;
1303 	struct inode *inode;
1304 	int err;
1305 
1306 	inode = base->d_inode;
1307 
1308 	/*
1309 	 * See if the low-level filesystem might want
1310 	 * to use its own hash..
1311 	 */
1312 	if (base->d_op && base->d_op->d_hash) {
1313 		err = base->d_op->d_hash(base, name);
1314 		dentry = ERR_PTR(err);
1315 		if (err < 0)
1316 			goto out;
1317 	}
1318 
1319 	dentry = cached_lookup(base, name, nd);
1320 	if (!dentry) {
1321 		struct dentry *new = d_alloc(base, name);
1322 		dentry = ERR_PTR(-ENOMEM);
1323 		if (!new)
1324 			goto out;
1325 		dentry = inode->i_op->lookup(inode, new, nd);
1326 		if (!dentry)
1327 			dentry = new;
1328 		else
1329 			dput(new);
1330 	}
1331 out:
1332 	return dentry;
1333 }
1334 
1335 /*
1336  * Restricted form of lookup. Doesn't follow links, single-component only,
1337  * needs parent already locked. Doesn't follow mounts.
1338  * SMP-safe.
1339  */
1340 static struct dentry *lookup_hash(struct nameidata *nd)
1341 {
1342 	int err;
1343 
1344 	err = permission(nd->path.dentry->d_inode, MAY_EXEC, nd);
1345 	if (err)
1346 		return ERR_PTR(err);
1347 	return __lookup_hash(&nd->last, nd->path.dentry, nd);
1348 }
1349 
1350 static int __lookup_one_len(const char *name, struct qstr *this,
1351 		struct dentry *base, int len)
1352 {
1353 	unsigned long hash;
1354 	unsigned int c;
1355 
1356 	this->name = name;
1357 	this->len = len;
1358 	if (!len)
1359 		return -EACCES;
1360 
1361 	hash = init_name_hash();
1362 	while (len--) {
1363 		c = *(const unsigned char *)name++;
1364 		if (c == '/' || c == '\0')
1365 			return -EACCES;
1366 		hash = partial_name_hash(c, hash);
1367 	}
1368 	this->hash = end_name_hash(hash);
1369 	return 0;
1370 }
1371 
1372 /**
1373  * lookup_one_len - filesystem helper to lookup single pathname component
1374  * @name:	pathname component to lookup
1375  * @base:	base directory to lookup from
1376  * @len:	maximum length @len should be interpreted to
1377  *
1378  * Note that this routine is purely a helper for filesystem usage and should
1379  * not be called by generic code.  Also note that by using this function the
1380  * nameidata argument is passed to the filesystem methods and a filesystem
1381  * using this helper needs to be prepared for that.
1382  */
1383 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1384 {
1385 	int err;
1386 	struct qstr this;
1387 
1388 	err = __lookup_one_len(name, &this, base, len);
1389 	if (err)
1390 		return ERR_PTR(err);
1391 
1392 	err = permission(base->d_inode, MAY_EXEC, NULL);
1393 	if (err)
1394 		return ERR_PTR(err);
1395 	return __lookup_hash(&this, base, NULL);
1396 }
1397 
1398 /**
1399  * lookup_one_noperm - bad hack for sysfs
1400  * @name:	pathname component to lookup
1401  * @base:	base directory to lookup from
1402  *
1403  * This is a variant of lookup_one_len that doesn't perform any permission
1404  * checks.   It's a horrible hack to work around the braindead sysfs
1405  * architecture and should not be used anywhere else.
1406  *
1407  * DON'T USE THIS FUNCTION EVER, thanks.
1408  */
1409 struct dentry *lookup_one_noperm(const char *name, struct dentry *base)
1410 {
1411 	int err;
1412 	struct qstr this;
1413 
1414 	err = __lookup_one_len(name, &this, base, strlen(name));
1415 	if (err)
1416 		return ERR_PTR(err);
1417 	return __lookup_hash(&this, base, NULL);
1418 }
1419 
1420 int __user_walk_fd(int dfd, const char __user *name, unsigned flags,
1421 			    struct nameidata *nd)
1422 {
1423 	char *tmp = getname(name);
1424 	int err = PTR_ERR(tmp);
1425 
1426 	if (!IS_ERR(tmp)) {
1427 		err = do_path_lookup(dfd, tmp, flags, nd);
1428 		putname(tmp);
1429 	}
1430 	return err;
1431 }
1432 
1433 int __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
1434 {
1435 	return __user_walk_fd(AT_FDCWD, name, flags, nd);
1436 }
1437 
1438 /*
1439  * It's inline, so penalty for filesystems that don't use sticky bit is
1440  * minimal.
1441  */
1442 static inline int check_sticky(struct inode *dir, struct inode *inode)
1443 {
1444 	if (!(dir->i_mode & S_ISVTX))
1445 		return 0;
1446 	if (inode->i_uid == current->fsuid)
1447 		return 0;
1448 	if (dir->i_uid == current->fsuid)
1449 		return 0;
1450 	return !capable(CAP_FOWNER);
1451 }
1452 
1453 /*
1454  *	Check whether we can remove a link victim from directory dir, check
1455  *  whether the type of victim is right.
1456  *  1. We can't do it if dir is read-only (done in permission())
1457  *  2. We should have write and exec permissions on dir
1458  *  3. We can't remove anything from append-only dir
1459  *  4. We can't do anything with immutable dir (done in permission())
1460  *  5. If the sticky bit on dir is set we should either
1461  *	a. be owner of dir, or
1462  *	b. be owner of victim, or
1463  *	c. have CAP_FOWNER capability
1464  *  6. If the victim is append-only or immutable we can't do antyhing with
1465  *     links pointing to it.
1466  *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
1467  *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
1468  *  9. We can't remove a root or mountpoint.
1469  * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
1470  *     nfs_async_unlink().
1471  */
1472 static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
1473 {
1474 	int error;
1475 
1476 	if (!victim->d_inode)
1477 		return -ENOENT;
1478 
1479 	BUG_ON(victim->d_parent->d_inode != dir);
1480 	audit_inode_child(victim->d_name.name, victim, dir);
1481 
1482 	error = permission(dir,MAY_WRITE | MAY_EXEC, NULL);
1483 	if (error)
1484 		return error;
1485 	if (IS_APPEND(dir))
1486 		return -EPERM;
1487 	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
1488 	    IS_IMMUTABLE(victim->d_inode))
1489 		return -EPERM;
1490 	if (isdir) {
1491 		if (!S_ISDIR(victim->d_inode->i_mode))
1492 			return -ENOTDIR;
1493 		if (IS_ROOT(victim))
1494 			return -EBUSY;
1495 	} else if (S_ISDIR(victim->d_inode->i_mode))
1496 		return -EISDIR;
1497 	if (IS_DEADDIR(dir))
1498 		return -ENOENT;
1499 	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
1500 		return -EBUSY;
1501 	return 0;
1502 }
1503 
1504 /*	Check whether we can create an object with dentry child in directory
1505  *  dir.
1506  *  1. We can't do it if child already exists (open has special treatment for
1507  *     this case, but since we are inlined it's OK)
1508  *  2. We can't do it if dir is read-only (done in permission())
1509  *  3. We should have write and exec permissions on dir
1510  *  4. We can't do it if dir is immutable (done in permission())
1511  */
1512 static inline int may_create(struct inode *dir, struct dentry *child,
1513 			     struct nameidata *nd)
1514 {
1515 	if (child->d_inode)
1516 		return -EEXIST;
1517 	if (IS_DEADDIR(dir))
1518 		return -ENOENT;
1519 	return permission(dir,MAY_WRITE | MAY_EXEC, nd);
1520 }
1521 
1522 /*
1523  * O_DIRECTORY translates into forcing a directory lookup.
1524  */
1525 static inline int lookup_flags(unsigned int f)
1526 {
1527 	unsigned long retval = LOOKUP_FOLLOW;
1528 
1529 	if (f & O_NOFOLLOW)
1530 		retval &= ~LOOKUP_FOLLOW;
1531 
1532 	if (f & O_DIRECTORY)
1533 		retval |= LOOKUP_DIRECTORY;
1534 
1535 	return retval;
1536 }
1537 
/*
 * Lock the two directories involved in a rename.
 *
 * p1 and p2 should be directories on the same fs.
 *
 * If p1 == p2 only that directory's i_mutex is taken and NULL is
 * returned.  Otherwise the superblock's s_vfs_rename_mutex is taken
 * first, and then both i_mutexes.  When one directory is an ancestor of
 * the other, the ancestor is locked first (I_MUTEX_PARENT) and the
 * return value is the dentry on the path from the descendant up to the
 * ancestor whose d_parent is the ancestor.  When neither is an ancestor
 * of the other, p1 is locked before p2 and NULL is returned.
 *
 * Undone by unlock_rename().
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p;

	if (p1 == p2) {
		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
		return NULL;
	}

	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);

	/* walk up from p1 until a dentry that is its own parent: is p2 above p1? */
	for (p = p1; p->d_parent != p; p = p->d_parent) {
		if (p->d_parent == p2) {
			mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
			mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
			return p;
		}
	}

	/* same walk from p2: is p1 above p2? */
	for (p = p2; p->d_parent != p; p = p->d_parent) {
		if (p->d_parent == p1) {
			mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
			mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
			return p;
		}
	}

	/* unrelated directories */
	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
	return NULL;
}
1572 
1573 void unlock_rename(struct dentry *p1, struct dentry *p2)
1574 {
1575 	mutex_unlock(&p1->d_inode->i_mutex);
1576 	if (p1 != p2) {
1577 		mutex_unlock(&p2->d_inode->i_mutex);
1578 		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
1579 	}
1580 }
1581 
1582 int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1583 		struct nameidata *nd)
1584 {
1585 	int error = may_create(dir, dentry, nd);
1586 
1587 	if (error)
1588 		return error;
1589 
1590 	if (!dir->i_op || !dir->i_op->create)
1591 		return -EACCES;	/* shouldn't it be ENOSYS? */
1592 	mode &= S_IALLUGO;
1593 	mode |= S_IFREG;
1594 	error = security_inode_create(dir, dentry, mode);
1595 	if (error)
1596 		return error;
1597 	DQUOT_INIT(dir);
1598 	error = dir->i_op->create(dir, dentry, mode, nd);
1599 	if (!error)
1600 		fsnotify_create(dir, dentry);
1601 	return error;
1602 }
1603 
/*
 * Check whether the object described by @nd may be opened, and perform
 * the O_TRUNC truncation if one was requested.
 *
 * @nd:       nameidata whose path refers to the object being opened
 * @acc_mode: MAY_* access bits that are checked with vfs_permission()
 * @flag:     open flags in the namei format
 *
 * Returns 0 on success or a negative error:
 *   -ENOENT  the dentry is negative
 *   -ELOOP   the object is a symlink
 *   -EISDIR  write access was requested on a directory
 *   -EACCES  device node on a mount with MNT_NODEV
 *   -EPERM   append-only / O_NOATIME restrictions (see below)
 * or whatever vfs_permission(), break_lease(), get_write_access(),
 * locks_verify_locked() or do_truncate() return.
 */
int may_open(struct nameidata *nd, int acc_mode, int flag)
{
	struct dentry *dentry = nd->path.dentry;
	struct inode *inode = dentry->d_inode;
	int error;

	if (!inode)
		return -ENOENT;

	if (S_ISLNK(inode->i_mode))
		return -ELOOP;

	if (S_ISDIR(inode->i_mode) && (acc_mode & MAY_WRITE))
		return -EISDIR;

	/*
	 * FIFO's, sockets and device files are special: they don't
	 * actually live on the filesystem itself, and as such you
	 * can write to them even if the filesystem is read-only.
	 * O_TRUNC is silently ignored for all of them.
	 */
	if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
	    	flag &= ~O_TRUNC;
	} else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
		if (nd->path.mnt->mnt_flags & MNT_NODEV)
			return -EACCES;

		flag &= ~O_TRUNC;
	}

	error = vfs_permission(nd, acc_mode);
	if (error)
		return error;
	/*
	 * An append-only file must be opened in append mode for writing,
	 * and may never be truncated.
	 */
	if (IS_APPEND(inode)) {
		if  ((flag & FMODE_WRITE) && !(flag & O_APPEND))
			return -EPERM;
		if (flag & O_TRUNC)
			return -EPERM;
	}

	/* O_NOATIME can only be set by the owner or superuser */
	if (flag & O_NOATIME)
		if (!is_owner_or_cap(inode))
			return -EPERM;

	/*
	 * Ensure there are no outstanding leases on the file.
	 */
	error = break_lease(inode, flag);
	if (error)
		return error;

	if (flag & O_TRUNC) {
		/* write access is held only for the duration of the truncate */
		error = get_write_access(inode);
		if (error)
			return error;

		/*
		 * Refuse to truncate files with mandatory locks held on them.
		 */
		error = locks_verify_locked(inode);
		if (!error) {
			DQUOT_INIT(inode);

			error = do_truncate(dentry, 0,
					    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
					    NULL);
		}
		put_write_access(inode);
		if (error)
			return error;
	} else
		if (flag & FMODE_WRITE)
			DQUOT_INIT(inode);

	return 0;
}
1683 
/*
 * Create the file for path->dentry in the directory nd->path.dentry and
 * run the open checks on it.
 *
 * Entered with the directory's i_mutex held; the mutex is released here
 * whether or not the create succeeded.  The reference on the directory
 * dentry held in @nd is dropped and nd->path.dentry is switched to
 * path->dentry, also regardless of the outcome.  The umask is applied to
 * @mode unless the directory is IS_POSIXACL.
 *
 * Returns the vfs_create() error, or else the result of may_open().
 *
 * Be careful about ever adding any more callers of this
 * function.  Its flags must be in the namei format, not
 * what get passed to sys_open().
 */
static int __open_namei_create(struct nameidata *nd, struct path *path,
				int flag, int mode)
{
	int error;
	struct dentry *dir = nd->path.dentry;

	if (!IS_POSIXACL(dir->d_inode))
		mode &= ~current->fs->umask;
	error = vfs_create(dir->d_inode, path->dentry, mode, nd);
	mutex_unlock(&dir->d_inode->i_mutex);
	/* nd now refers to the (possibly still negative) child, not the dir */
	dput(nd->path.dentry);
	nd->path.dentry = path->dentry;
	if (error)
		return error;
	/* Don't check for write permission, don't truncate */
	return may_open(nd, 0, flag & ~O_TRUNC);
}
1706 
/*
 * Convert sys_open() flags to the namei format.
 *
 * The flag value (low two bits) for sys_open means:
 *	00 - read-only
 *	01 - write-only
 *	10 - read-write
 *	11 - special
 * whereas the internal routines (ie open_namei()/follow_link() etc) use
 *	00 - no permissions needed
 *	01 - read-permission
 *	10 - write-permission
 *	11 - read-write
 * This is more logical, and also allows the 00 "no perm needed"
 * to be used for symlinks (where the permissions are checked
 * later).
 *
 * The conversion adds one to the flags, except for the "special" value
 * 11, which is passed through unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	int bumped = flag + 1;

	return (bumped & O_ACCMODE) ? bumped : flag;
}
1730 
1731 static int open_will_write_to_fs(int flag, struct inode *inode)
1732 {
1733 	/*
1734 	 * We'll never write to the fs underlying
1735 	 * a device file.
1736 	 */
1737 	if (special_file(inode->i_mode))
1738 		return 0;
1739 	return (flag & O_TRUNC);
1740 }
1741 
/*
 * Open (and possibly create) the file named by @pathname and return a
 * struct file for it, or an ERR_PTR() on failure.
 *
 * @dfd:       directory fd that relative names are resolved against
 * @pathname:  kernel-space path name
 * @open_flag: open flags in the sys_open() format
 * @mode:      mode used if a new file has to be created
 *
 * Without O_CREAT this is a plain open-intent lookup followed by the
 * checks at "ok".  With O_CREAT the parent is looked up first and the
 * last component is resolved by hand under the parent's i_mutex
 * ("do_last"); a trailing symlink is followed manually ("do_link"), at
 * most 32 times, and then "do_last" is repeated.
 *
 * Note that the low bits of the passed in "open_flag"
 * are not the same as in the local variable "flag". See
 * open_to_namei_flags() for more details.
 */
struct file *do_filp_open(int dfd, const char *pathname,
		int open_flag, int mode)
{
	struct file *filp;
	struct nameidata nd;
	int acc_mode, error;
	struct path path;
	struct dentry *dir;
	int count = 0;		/* number of trailing symlinks followed */
	int will_write;
	int flag = open_to_namei_flags(open_flag);

	acc_mode = ACC_MODE(flag);

	/* O_TRUNC implies we need access checks for write permissions */
	if (flag & O_TRUNC)
		acc_mode |= MAY_WRITE;

	/* Allow the LSM permission hook to distinguish append
	   access from general write access. */
	if (flag & O_APPEND)
		acc_mode |= MAY_APPEND;

	/*
	 * The simplest case - just a plain lookup.
	 */
	if (!(flag & O_CREAT)) {
		error = path_lookup_open(dfd, pathname, lookup_flags(flag),
					 &nd, flag);
		if (error)
			return ERR_PTR(error);
		goto ok;
	}

	/*
	 * Create - we need to know the parent.
	 */
	error = path_lookup_create(dfd, pathname, LOOKUP_PARENT,
				   &nd, flag, mode);
	if (error)
		return ERR_PTR(error);

	/*
	 * We have the parent and last component. First of all, check
	 * that we are not asked to creat(2) an obvious directory - that
	 * will not do.
	 */
	error = -EISDIR;
	if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
		goto exit;

	dir = nd.path.dentry;
	nd.flags &= ~LOOKUP_PARENT;
	mutex_lock(&dir->d_inode->i_mutex);
	path.dentry = lookup_hash(&nd);
	path.mnt = nd.path.mnt;

do_last:
	/* entered with dir->d_inode->i_mutex held */
	error = PTR_ERR(path.dentry);
	if (IS_ERR(path.dentry)) {
		mutex_unlock(&dir->d_inode->i_mutex);
		goto exit;
	}

	if (IS_ERR(nd.intent.open.file)) {
		error = PTR_ERR(nd.intent.open.file);
		goto exit_mutex_unlock;
	}

	/* Negative dentry, just create the file */
	if (!path.dentry->d_inode) {
		/*
		 * This write is needed to ensure that a
		 * ro->rw transition does not occur between
		 * the time when the file is created and when
		 * a permanent write count is taken through
		 * the 'struct file' in nameidata_to_filp().
		 */
		error = mnt_want_write(nd.path.mnt);
		if (error)
			goto exit_mutex_unlock;
		/* __open_namei_create() drops the directory's i_mutex */
		error = __open_namei_create(&nd, &path, flag, mode);
		if (error) {
			mnt_drop_write(nd.path.mnt);
			goto exit;
		}
		filp = nameidata_to_filp(&nd, open_flag);
		mnt_drop_write(nd.path.mnt);
		return filp;
	}

	/*
	 * It already exists.
	 */
	mutex_unlock(&dir->d_inode->i_mutex);
	audit_inode(pathname, path.dentry);

	error = -EEXIST;
	if (flag & O_EXCL)
		goto exit_dput;

	if (__follow_mount(&path)) {
		error = -ELOOP;
		if (flag & O_NOFOLLOW)
			goto exit_dput;
	}

	error = -ENOENT;
	if (!path.dentry->d_inode)
		goto exit_dput;
	if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
		goto do_link;

	path_to_nameidata(&path, &nd);
	error = -EISDIR;
	if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
		goto exit;
ok:
	/*
	 * Consider:
	 * 1. may_open() truncates a file
	 * 2. a rw->ro mount transition occurs
	 * 3. nameidata_to_filp() fails due to
	 *    the ro mount.
	 * That would be inconsistent, and should
	 * be avoided. Taking this mnt write here
	 * ensures that (2) can not occur.
	 */
	will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
	if (will_write) {
		error = mnt_want_write(nd.path.mnt);
		if (error)
			goto exit;
	}
	error = may_open(&nd, acc_mode, flag);
	if (error) {
		if (will_write)
			mnt_drop_write(nd.path.mnt);
		goto exit;
	}
	filp = nameidata_to_filp(&nd, open_flag);
	/*
	 * It is now safe to drop the mnt write
	 * because the filp has had a write taken
	 * on its behalf.
	 */
	if (will_write)
		mnt_drop_write(nd.path.mnt);
	return filp;

	/*
	 * Error exits, from most to least state to undo: directory mutex,
	 * the looked-up path, then the open intent and nd.path.
	 */
exit_mutex_unlock:
	mutex_unlock(&dir->d_inode->i_mutex);
exit_dput:
	path_put_conditional(&path, &nd);
exit:
	if (!IS_ERR(nd.intent.open.file))
		release_open_intent(&nd);
	path_put(&nd.path);
	return ERR_PTR(error);

do_link:
	error = -ELOOP;
	if (flag & O_NOFOLLOW)
		goto exit_dput;
	/*
	 * This is subtle. Instead of calling do_follow_link() we do the
	 * thing by hands. The reason is that this way we have zero link_count
	 * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
	 * After that we have the parent and last component, i.e.
	 * we are in the same situation as after the first path_walk().
	 * Well, almost - if the last component is normal we get its copy
	 * stored in nd->last.name and we will have to putname() it when we
	 * are done. Procfs-like symlinks just set LAST_BIND.
	 */
	nd.flags |= LOOKUP_PARENT;
	error = security_inode_follow_link(path.dentry, &nd);
	if (error)
		goto exit_dput;
	error = __do_follow_link(&path, &nd);
	if (error) {
		/* Does someone understand code flow here? Or it is only
		 * me so stupid? Anathema to whoever designed this non-sense
		 * with "intent.open".
		 */
		release_open_intent(&nd);
		return ERR_PTR(error);
	}
	nd.flags &= ~LOOKUP_PARENT;
	if (nd.last_type == LAST_BIND)
		goto ok;
	error = -EISDIR;
	if (nd.last_type != LAST_NORM)
		goto exit;
	/* a trailing slash after the last component means a directory */
	if (nd.last.name[nd.last.len]) {
		__putname(nd.last.name);
		goto exit;
	}
	error = -ELOOP;
	if (count++==32) {
		__putname(nd.last.name);
		goto exit;
	}
	/* look the new last component up in its parent and go round again */
	dir = nd.path.dentry;
	mutex_lock(&dir->d_inode->i_mutex);
	path.dentry = lookup_hash(&nd);
	path.mnt = nd.path.mnt;
	__putname(nd.last.name);
	goto do_last;
}
1956 
1957 /**
1958  * filp_open - open file and return file pointer
1959  *
1960  * @filename:	path to open
1961  * @flags:	open flags as per the open(2) second argument
1962  * @mode:	mode for the new file if O_CREAT is set, else ignored
1963  *
1964  * This is the helper to open a file from kernelspace if you really
1965  * have to.  But in generally you should not do this, so please move
1966  * along, nothing to see here..
1967  */
1968 struct file *filp_open(const char *filename, int flags, int mode)
1969 {
1970 	return do_filp_open(AT_FDCWD, filename, flags, mode);
1971 }
1972 EXPORT_SYMBOL(filp_open);
1973 
1974 /**
1975  * lookup_create - lookup a dentry, creating it if it doesn't exist
1976  * @nd: nameidata info
1977  * @is_dir: directory flag
1978  *
1979  * Simple function to lookup and return a dentry and create it
1980  * if it doesn't exist.  Is SMP-safe.
1981  *
1982  * Returns with nd->path.dentry->d_inode->i_mutex locked.
1983  */
1984 struct dentry *lookup_create(struct nameidata *nd, int is_dir)
1985 {
1986 	struct dentry *dentry = ERR_PTR(-EEXIST);
1987 
1988 	mutex_lock_nested(&nd->path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
1989 	/*
1990 	 * Yucky last component or no last component at all?
1991 	 * (foo/., foo/.., /////)
1992 	 */
1993 	if (nd->last_type != LAST_NORM)
1994 		goto fail;
1995 	nd->flags &= ~LOOKUP_PARENT;
1996 	nd->flags |= LOOKUP_CREATE;
1997 	nd->intent.open.flags = O_EXCL;
1998 
1999 	/*
2000 	 * Do the final lookup.
2001 	 */
2002 	dentry = lookup_hash(nd);
2003 	if (IS_ERR(dentry))
2004 		goto fail;
2005 
2006 	if (dentry->d_inode)
2007 		goto eexist;
2008 	/*
2009 	 * Special case - lookup gave negative, but... we had foo/bar/
2010 	 * From the vfs_mknod() POV we just have a negative dentry -
2011 	 * all is fine. Let's be bastards - you had / on the end, you've
2012 	 * been asking for (non-existent) directory. -ENOENT for you.
2013 	 */
2014 	if (unlikely(!is_dir && nd->last.name[nd->last.len])) {
2015 		dput(dentry);
2016 		dentry = ERR_PTR(-ENOENT);
2017 	}
2018 	return dentry;
2019 eexist:
2020 	dput(dentry);
2021 	dentry = ERR_PTR(-EEXIST);
2022 fail:
2023 	return dentry;
2024 }
2025 EXPORT_SYMBOL_GPL(lookup_create);
2026 
2027 int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
2028 {
2029 	int error = may_create(dir, dentry, NULL);
2030 
2031 	if (error)
2032 		return error;
2033 
2034 	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
2035 		return -EPERM;
2036 
2037 	if (!dir->i_op || !dir->i_op->mknod)
2038 		return -EPERM;
2039 
2040 	error = devcgroup_inode_mknod(mode, dev);
2041 	if (error)
2042 		return error;
2043 
2044 	error = security_inode_mknod(dir, dentry, mode, dev);
2045 	if (error)
2046 		return error;
2047 
2048 	DQUOT_INIT(dir);
2049 	error = dir->i_op->mknod(dir, dentry, mode, dev);
2050 	if (!error)
2051 		fsnotify_create(dir, dentry);
2052 	return error;
2053 }
2054 
2055 static int may_mknod(mode_t mode)
2056 {
2057 	switch (mode & S_IFMT) {
2058 	case S_IFREG:
2059 	case S_IFCHR:
2060 	case S_IFBLK:
2061 	case S_IFIFO:
2062 	case S_IFSOCK:
2063 	case 0: /* zero mode translates to S_IFREG */
2064 		return 0;
2065 	case S_IFDIR:
2066 		return -EPERM;
2067 	default:
2068 		return -EINVAL;
2069 	}
2070 }
2071 
/*
 * mknodat system call: create a filesystem node named by the user-space
 * path @filename, resolved relative to @dfd.
 *
 * @mode carries both the file type and the permission bits; the umask is
 * applied unless the parent directory is IS_POSIXACL.  @dev is the
 * user-space encoded device number and is only used for character and
 * block devices.
 *
 * Returns 0 on success or a negative error; directories are rejected
 * with -EPERM.
 */
asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
				unsigned dev)
{
	int error = 0;
	char * tmp;
	struct dentry * dentry;
	struct nameidata nd;

	if (S_ISDIR(mode))
		return -EPERM;
	tmp = getname(filename);
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);

	error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
	if (error)
		goto out;
	/* lookup_create() returns with the parent's i_mutex held, even on error */
	dentry = lookup_create(&nd, 0);
	if (IS_ERR(dentry)) {
		error = PTR_ERR(dentry);
		goto out_unlock;
	}
	if (!IS_POSIXACL(nd.path.dentry->d_inode))
		mode &= ~current->fs->umask;
	error = may_mknod(mode);
	if (error)
		goto out_dput;
	error = mnt_want_write(nd.path.mnt);
	if (error)
		goto out_dput;
	/* may_mknod() has already rejected every type not handled below */
	switch (mode & S_IFMT) {
		case 0: case S_IFREG:
			error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
			break;
		case S_IFCHR: case S_IFBLK:
			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,
					new_decode_dev(dev));
			break;
		case S_IFIFO: case S_IFSOCK:
			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
			break;
	}
	mnt_drop_write(nd.path.mnt);
out_dput:
	dput(dentry);
out_unlock:
	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
	path_put(&nd.path);
out:
	putname(tmp);

	return error;
}
2125 
2126 asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev)
2127 {
2128 	return sys_mknodat(AT_FDCWD, filename, mode, dev);
2129 }
2130 
2131 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2132 {
2133 	int error = may_create(dir, dentry, NULL);
2134 
2135 	if (error)
2136 		return error;
2137 
2138 	if (!dir->i_op || !dir->i_op->mkdir)
2139 		return -EPERM;
2140 
2141 	mode &= (S_IRWXUGO|S_ISVTX);
2142 	error = security_inode_mkdir(dir, dentry, mode);
2143 	if (error)
2144 		return error;
2145 
2146 	DQUOT_INIT(dir);
2147 	error = dir->i_op->mkdir(dir, dentry, mode);
2148 	if (!error)
2149 		fsnotify_mkdir(dir, dentry);
2150 	return error;
2151 }
2152 
/*
 * mkdirat system call: create the directory named by the user-space path
 * @pathname, resolved relative to @dfd.  The umask is applied to @mode
 * unless the parent directory is IS_POSIXACL.
 *
 * Returns 0 on success or a negative error.
 */
asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
{
	int error = 0;
	char * tmp;
	struct dentry *dentry;
	struct nameidata nd;

	tmp = getname(pathname);
	error = PTR_ERR(tmp);
	if (IS_ERR(tmp))
		goto out_err;

	error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
	if (error)
		goto out;
	/* lookup_create() returns with the parent's i_mutex held, even on error */
	dentry = lookup_create(&nd, 1);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_unlock;

	if (!IS_POSIXACL(nd.path.dentry->d_inode))
		mode &= ~current->fs->umask;
	error = mnt_want_write(nd.path.mnt);
	if (error)
		goto out_dput;
	error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
	mnt_drop_write(nd.path.mnt);
out_dput:
	dput(dentry);
out_unlock:
	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
	path_put(&nd.path);
out:
	putname(tmp);
out_err:
	return error;
}
2190 
2191 asmlinkage long sys_mkdir(const char __user *pathname, int mode)
2192 {
2193 	return sys_mkdirat(AT_FDCWD, pathname, mode);
2194 }
2195 
2196 /*
2197  * We try to drop the dentry early: we should have
2198  * a usage count of 2 if we're the only user of this
2199  * dentry, and if that is true (possibly after pruning
2200  * the dcache), then we drop the dentry now.
2201  *
2202  * A low-level filesystem can, if it chooses, legally
2203  * do a
2204  *
2205  *	if (!d_unhashed(dentry))
2206  *		return -EBUSY;
2207  *
2208  * if it cannot handle the case of removing a directory
2209  * that is still in use by something else..
 *
 * The reference taken by dget() below is NOT released here.  The callers
 * in this file (vfs_rmdir() and vfs_rename_dir()) dput() the dentry after
 * they have finished with it.
2210  */
2211 void dentry_unhash(struct dentry *dentry)
2212 {
2213 	dget(dentry);			/* extra reference, kept on return */
2214 	shrink_dcache_parent(dentry);	/* the "pruning" mentioned above */
2215 	spin_lock(&dcache_lock);	/* dcache_lock first, then d_lock */
2216 	spin_lock(&dentry->d_lock);
2217 	if (atomic_read(&dentry->d_count) == 2)	/* see header: sole user */
2218 		__d_drop(dentry);
2219 	spin_unlock(&dentry->d_lock);
2220 	spin_unlock(&dcache_lock);
2221 }
2222 
2223 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2224 {
2225 	int error = may_delete(dir, dentry, 1);
2226 
2227 	if (error)
2228 		return error;
2229 
2230 	if (!dir->i_op || !dir->i_op->rmdir)
2231 		return -EPERM;
2232 
2233 	DQUOT_INIT(dir);
2234 
2235 	mutex_lock(&dentry->d_inode->i_mutex);
2236 	dentry_unhash(dentry);
2237 	if (d_mountpoint(dentry))
2238 		error = -EBUSY;
2239 	else {
2240 		error = security_inode_rmdir(dir, dentry);
2241 		if (!error) {
2242 			error = dir->i_op->rmdir(dir, dentry);
2243 			if (!error)
2244 				dentry->d_inode->i_flags |= S_DEAD;
2245 		}
2246 	}
2247 	mutex_unlock(&dentry->d_inode->i_mutex);
2248 	if (!error) {
2249 		d_delete(dentry);
2250 	}
2251 	dput(dentry);
2252 
2253 	return error;
2254 }
2255 
2256 static long do_rmdir(int dfd, const char __user *pathname)
2257 {
2258 	int error = 0;
2259 	char * name;
2260 	struct dentry *dentry;
2261 	struct nameidata nd;
2262 
2263 	name = getname(pathname);
2264 	if(IS_ERR(name))
2265 		return PTR_ERR(name);
2266 
2267 	error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
2268 	if (error)
2269 		goto exit;
2270 
2271 	switch(nd.last_type) {
2272 		case LAST_DOTDOT:
2273 			error = -ENOTEMPTY;
2274 			goto exit1;
2275 		case LAST_DOT:
2276 			error = -EINVAL;
2277 			goto exit1;
2278 		case LAST_ROOT:
2279 			error = -EBUSY;
2280 			goto exit1;
2281 	}
2282 	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2283 	dentry = lookup_hash(&nd);
2284 	error = PTR_ERR(dentry);
2285 	if (IS_ERR(dentry))
2286 		goto exit2;
2287 	error = mnt_want_write(nd.path.mnt);
2288 	if (error)
2289 		goto exit3;
2290 	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
2291 	mnt_drop_write(nd.path.mnt);
2292 exit3:
2293 	dput(dentry);
2294 exit2:
2295 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2296 exit1:
2297 	path_put(&nd.path);
2298 exit:
2299 	putname(name);
2300 	return error;
2301 }
2302 
2303 asmlinkage long sys_rmdir(const char __user *pathname)
2304 {
2305 	return do_rmdir(AT_FDCWD, pathname);
2306 }
2307 
2308 int vfs_unlink(struct inode *dir, struct dentry *dentry)
2309 {
2310 	int error = may_delete(dir, dentry, 0);
2311 
2312 	if (error)
2313 		return error;
2314 
2315 	if (!dir->i_op || !dir->i_op->unlink)
2316 		return -EPERM;
2317 
2318 	DQUOT_INIT(dir);
2319 
2320 	mutex_lock(&dentry->d_inode->i_mutex);
2321 	if (d_mountpoint(dentry))
2322 		error = -EBUSY;
2323 	else {
2324 		error = security_inode_unlink(dir, dentry);
2325 		if (!error)
2326 			error = dir->i_op->unlink(dir, dentry);
2327 	}
2328 	mutex_unlock(&dentry->d_inode->i_mutex);
2329 
2330 	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
2331 	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
2332 		fsnotify_link_count(dentry->d_inode);
2333 		d_delete(dentry);
2334 	}
2335 
2336 	return error;
2337 }
2338 
2339 /*
2340  * Make sure that the actual truncation of the file will occur outside its
2341  * directory's i_mutex.  Truncate can take a long time if there is a lot of
2342  * writeout happening, and we don't want to prevent access to the directory
2343  * while waiting on the I/O.
 *
 * Shared implementation behind sys_unlink() and sys_unlinkat() without
 * AT_REMOVEDIR.  Returns 0 or a negative errno.
 *
 * To achieve the above, an extra reference on the victim inode is taken
 * before vfs_unlink() and only dropped, via iput(), after the parent's
 * i_mutex has been released.
 *
 * Control flow note: the "slashes" label sits after the normal return and
 * jumps back to "exit2", which is inside the !IS_ERR(dentry) block.  On
 * that path "inode" is still NULL, so no iput() is performed.
2344  */
2345 static long do_unlinkat(int dfd, const char __user *pathname)
2346 {
2347 	int error = 0;
2348 	char * name;
2349 	struct dentry *dentry;
2350 	struct nameidata nd;
2351 	struct inode *inode = NULL;	/* set only once the victim is known */
2352 
2353 	name = getname(pathname);
2354 	if(IS_ERR(name))
2355 		return PTR_ERR(name);
2356 
2357 	error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
2358 	if (error)
2359 		goto exit;
2360 	error = -EISDIR;	/* result when the last component is not LAST_NORM */
2361 	if (nd.last_type != LAST_NORM)
2362 		goto exit1;
2363 	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2364 	dentry = lookup_hash(&nd);
2365 	error = PTR_ERR(dentry);	/* only meaningful if IS_ERR(dentry) */
2366 	if (!IS_ERR(dentry)) {
2367 		/* Why not before? Because we want correct error value */
2368 		if (nd.last.name[nd.last.len])	/* something follows the last component */
2369 			goto slashes;
2370 		inode = dentry->d_inode;
2371 		if (inode)
2372 			atomic_inc(&inode->i_count);	/* dropped by iput() below */
2373 		error = mnt_want_write(nd.path.mnt);
2374 		if (error)
2375 			goto exit2;
2376 		error = vfs_unlink(nd.path.dentry->d_inode, dentry);
2377 		mnt_drop_write(nd.path.mnt);
2378 	exit2:
2379 		dput(dentry);
2380 	}
2381 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2382 	if (inode)
2383 		iput(inode);	/* truncate the inode here */
2384 exit1:
2385 	path_put(&nd.path);
2386 exit:
2387 	putname(name);
2388 	return error;
2389 
2390 slashes:
	/* Pick the errno from what the name actually refers to. */
2391 	error = !dentry->d_inode ? -ENOENT :
2392 		S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
2393 	goto exit2;
2394 }
2395 
2396 asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag)
2397 {
2398 	if ((flag & ~AT_REMOVEDIR) != 0)
2399 		return -EINVAL;
2400 
2401 	if (flag & AT_REMOVEDIR)
2402 		return do_rmdir(dfd, pathname);
2403 
2404 	return do_unlinkat(dfd, pathname);
2405 }
2406 
2407 asmlinkage long sys_unlink(const char __user *pathname)
2408 {
2409 	return do_unlinkat(AT_FDCWD, pathname);
2410 }
2411 
2412 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode)
2413 {
2414 	int error = may_create(dir, dentry, NULL);
2415 
2416 	if (error)
2417 		return error;
2418 
2419 	if (!dir->i_op || !dir->i_op->symlink)
2420 		return -EPERM;
2421 
2422 	error = security_inode_symlink(dir, dentry, oldname);
2423 	if (error)
2424 		return error;
2425 
2426 	DQUOT_INIT(dir);
2427 	error = dir->i_op->symlink(dir, dentry, oldname);
2428 	if (!error)
2429 		fsnotify_create(dir, dentry);
2430 	return error;
2431 }
2432 
2433 asmlinkage long sys_symlinkat(const char __user *oldname,
2434 			      int newdfd, const char __user *newname)
2435 {
2436 	int error = 0;
2437 	char * from;
2438 	char * to;
2439 	struct dentry *dentry;
2440 	struct nameidata nd;
2441 
2442 	from = getname(oldname);
2443 	if(IS_ERR(from))
2444 		return PTR_ERR(from);
2445 	to = getname(newname);
2446 	error = PTR_ERR(to);
2447 	if (IS_ERR(to))
2448 		goto out_putname;
2449 
2450 	error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
2451 	if (error)
2452 		goto out;
2453 	dentry = lookup_create(&nd, 0);
2454 	error = PTR_ERR(dentry);
2455 	if (IS_ERR(dentry))
2456 		goto out_unlock;
2457 
2458 	error = mnt_want_write(nd.path.mnt);
2459 	if (error)
2460 		goto out_dput;
2461 	error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO);
2462 	mnt_drop_write(nd.path.mnt);
2463 out_dput:
2464 	dput(dentry);
2465 out_unlock:
2466 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2467 	path_put(&nd.path);
2468 out:
2469 	putname(to);
2470 out_putname:
2471 	putname(from);
2472 	return error;
2473 }
2474 
2475 asmlinkage long sys_symlink(const char __user *oldname, const char __user *newname)
2476 {
2477 	return sys_symlinkat(oldname, AT_FDCWD, newname);
2478 }
2479 
2480 int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
2481 {
2482 	struct inode *inode = old_dentry->d_inode;
2483 	int error;
2484 
2485 	if (!inode)
2486 		return -ENOENT;
2487 
2488 	error = may_create(dir, new_dentry, NULL);
2489 	if (error)
2490 		return error;
2491 
2492 	if (dir->i_sb != inode->i_sb)
2493 		return -EXDEV;
2494 
2495 	/*
2496 	 * A link to an append-only or immutable file cannot be created.
2497 	 */
2498 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2499 		return -EPERM;
2500 	if (!dir->i_op || !dir->i_op->link)
2501 		return -EPERM;
2502 	if (S_ISDIR(old_dentry->d_inode->i_mode))
2503 		return -EPERM;
2504 
2505 	error = security_inode_link(old_dentry, dir, new_dentry);
2506 	if (error)
2507 		return error;
2508 
2509 	mutex_lock(&old_dentry->d_inode->i_mutex);
2510 	DQUOT_INIT(dir);
2511 	error = dir->i_op->link(old_dentry, dir, new_dentry);
2512 	mutex_unlock(&old_dentry->d_inode->i_mutex);
2513 	if (!error)
2514 		fsnotify_link(dir, old_dentry->d_inode, new_dentry);
2515 	return error;
2516 }
2517 
2518 /*
2519  * Hardlinks are often used in delicate situations.  We avoid
2520  * security-related surprises by not following symlinks on the
2521  * newname.  --KAB
2522  *
2523  * We don't follow them on the oldname either to be compatible
2524  * with linux 2.0, and to avoid hard-linking to directories
2525  * and other special files.  --ADM
2526  */
2527 asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
2528 			   int newdfd, const char __user *newname,
2529 			   int flags)
2530 {
2531 	struct dentry *new_dentry;
2532 	struct nameidata nd, old_nd;
2533 	int error;
2534 	char * to;
2535 
2536 	if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
2537 		return -EINVAL;
2538 
2539 	to = getname(newname);
2540 	if (IS_ERR(to))
2541 		return PTR_ERR(to);
2542 
2543 	error = __user_walk_fd(olddfd, oldname,
2544 			       flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
2545 			       &old_nd);
2546 	if (error)
2547 		goto exit;
2548 	error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
2549 	if (error)
2550 		goto out;
2551 	error = -EXDEV;
2552 	if (old_nd.path.mnt != nd.path.mnt)
2553 		goto out_release;
2554 	new_dentry = lookup_create(&nd, 0);
2555 	error = PTR_ERR(new_dentry);
2556 	if (IS_ERR(new_dentry))
2557 		goto out_unlock;
2558 	error = mnt_want_write(nd.path.mnt);
2559 	if (error)
2560 		goto out_dput;
2561 	error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry);
2562 	mnt_drop_write(nd.path.mnt);
2563 out_dput:
2564 	dput(new_dentry);
2565 out_unlock:
2566 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2567 out_release:
2568 	path_put(&nd.path);
2569 out:
2570 	path_put(&old_nd.path);
2571 exit:
2572 	putname(to);
2573 
2574 	return error;
2575 }
2576 
2577 asmlinkage long sys_link(const char __user *oldname, const char __user *newname)
2578 {
2579 	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
2580 }
2581 
2582 /*
2583  * The worst of all namespace operations - renaming directory. "Perverted"
2584  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
2585  * Problems:
2586  *	a) we can get into loop creation. Check is done in is_subdir().
2587  *	b) race potential - two innocent renames can create a loop together.
2588  *	   That's where 4.4 screws up. Current fix: serialization on
2589  *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
2590  *	   story.
2591  *	c) we have to lock _three_ objects - parents and victim (if it exists).
2592  *	   And that - after we got ->i_mutex on parents (until then we don't know
2593  *	   whether the target exists).  Solution: try to be smart with locking
2594  *	   order for inodes.  We rely on the fact that tree topology may change
2595  *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
2596  *	   move will be locked.  Thus we can rank directories by the tree
2597  *	   (ancestors first) and rank all non-directories after them.
2598  *	   That works since everybody except rename does "lock parent, lookup,
2599  *	   lock child" and rename is under ->s_vfs_rename_mutex.
2600  *	   HOWEVER, it relies on the assumption that any object with ->lookup()
2601  *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
2602  *	   we'd better make sure that there's no link(2) for them.
2603  *	d) some filesystems don't support opened-but-unlinked directories,
2604  *	   either because of layout or because they are not ready to deal with
2605  *	   all cases correctly. The latter will be fixed (taking this sort of
2606  *	   stuff into VFS), but the former is not going away. Solution: the same
2607  *	   trick as in rmdir().
2608  *	e) conversion from fhandle to dentry may come in the wrong moment - when
2609  *	   we are removing the target. Solution: we will have to grab ->i_mutex
2610  *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
2611  *	   ->i_mutex on parents, which works but leads to some truly excessive
2612  *	   locking].
 *
 * Directory half of vfs_rename(); called from there when the source is a
 * directory.  Returns 0 or a negative errno (-EBUSY if either dentry is a
 * mountpoint, otherwise the result of permission(), the security hook or
 * ->rename()).
2613  */
2614 static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2615 			  struct inode *new_dir, struct dentry *new_dentry)
2616 {
2617 	int error = 0;
2618 	struct inode *target;	/* inode being replaced, NULL if none */
2619 
2620 	/*
2621 	 * If we are going to change the parent - check write permissions,
2622 	 * we'll need to flip '..'.
2623 	 */
2624 	if (new_dir != old_dir) {
2625 		error = permission(old_dentry->d_inode, MAY_WRITE, NULL);
2626 		if (error)
2627 			return error;
2628 	}
2629 
2630 	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
2631 	if (error)
2632 		return error;
2633 
2634 	target = new_dentry->d_inode;
2635 	if (target) {
		/* Same sequence as vfs_rmdir(): lock victim, then try to unhash. */
2636 		mutex_lock(&target->i_mutex);
2637 		dentry_unhash(new_dentry);	/* takes a ref, dropped below */
2638 	}
2639 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2640 		error = -EBUSY;
2641 	else
2642 		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2643 	if (target) {
2644 		if (!error)
2645 			target->i_flags |= S_DEAD;	/* replaced directory is gone */
2646 		mutex_unlock(&target->i_mutex);
		/* Undo the __d_drop() that dentry_unhash() may have done. */
2647 		if (d_unhashed(new_dentry))
2648 			d_rehash(new_dentry);
2649 		dput(new_dentry);	/* pairs with dget() in dentry_unhash() */
2650 	}
2651 	if (!error)
		/* Skip d_move() for filesystems flagged FS_RENAME_DOES_D_MOVE. */
2652 		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2653 			d_move(old_dentry,new_dentry);
2654 	return error;
2655 }
2656 
2657 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2658 			    struct inode *new_dir, struct dentry *new_dentry)
2659 {
2660 	struct inode *target;
2661 	int error;
2662 
2663 	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
2664 	if (error)
2665 		return error;
2666 
2667 	dget(new_dentry);
2668 	target = new_dentry->d_inode;
2669 	if (target)
2670 		mutex_lock(&target->i_mutex);
2671 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2672 		error = -EBUSY;
2673 	else
2674 		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2675 	if (!error) {
2676 		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2677 			d_move(old_dentry, new_dentry);
2678 	}
2679 	if (target)
2680 		mutex_unlock(&target->i_mutex);
2681 	dput(new_dentry);
2682 	return error;
2683 }
2684 
2685 int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2686 	       struct inode *new_dir, struct dentry *new_dentry)
2687 {
2688 	int error;
2689 	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
2690 	const char *old_name;
2691 
2692 	if (old_dentry->d_inode == new_dentry->d_inode)
2693  		return 0;
2694 
2695 	error = may_delete(old_dir, old_dentry, is_dir);
2696 	if (error)
2697 		return error;
2698 
2699 	if (!new_dentry->d_inode)
2700 		error = may_create(new_dir, new_dentry, NULL);
2701 	else
2702 		error = may_delete(new_dir, new_dentry, is_dir);
2703 	if (error)
2704 		return error;
2705 
2706 	if (!old_dir->i_op || !old_dir->i_op->rename)
2707 		return -EPERM;
2708 
2709 	DQUOT_INIT(old_dir);
2710 	DQUOT_INIT(new_dir);
2711 
2712 	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
2713 
2714 	if (is_dir)
2715 		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
2716 	else
2717 		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
2718 	if (!error) {
2719 		const char *new_name = old_dentry->d_name.name;
2720 		fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir,
2721 			      new_dentry->d_inode, old_dentry);
2722 	}
2723 	fsnotify_oldname_free(old_name);
2724 
2725 	return error;
2726 }
2727 
/*
 * Path-level rename used by sys_renameat().  Both names are kernel
 * strings here (sys_renameat() has already done getname()).
 *
 * Looks up both parents, requires them to be on the same vfsmount
 * (-EXDEV) and both last components to be LAST_NORM (-EBUSY), takes the
 * rename locks, looks up source and target under them, and calls
 * vfs_rename() with write access to the mount held.
 *
 * The "error = X; if (cond) goto ..." pairs below stage the errno before
 * each test, so statement order matters.  Cleanup unwinds through
 * exit5..exit in reverse order of acquisition.  Returns 0 or a negative
 * errno.
 */
2728 static int do_rename(int olddfd, const char *oldname,
2729 			int newdfd, const char *newname)
2730 {
2731 	int error = 0;
2732 	struct dentry * old_dir, * new_dir;
2733 	struct dentry * old_dentry, *new_dentry;
2734 	struct dentry * trap;	/* lock_rename() result, compared against below */
2735 	struct nameidata oldnd, newnd;
2736 
2737 	error = do_path_lookup(olddfd, oldname, LOOKUP_PARENT, &oldnd);
2738 	if (error)
2739 		goto exit;
2740 
2741 	error = do_path_lookup(newdfd, newname, LOOKUP_PARENT, &newnd);
2742 	if (error)
2743 		goto exit1;
2744 
2745 	error = -EXDEV;
2746 	if (oldnd.path.mnt != newnd.path.mnt)
2747 		goto exit2;
2748 
2749 	old_dir = oldnd.path.dentry;
2750 	error = -EBUSY;		/* applies to both last_type checks */
2751 	if (oldnd.last_type != LAST_NORM)
2752 		goto exit2;
2753 
2754 	new_dir = newnd.path.dentry;
2755 	if (newnd.last_type != LAST_NORM)
2756 		goto exit2;
2757 
2758 	trap = lock_rename(new_dir, old_dir);
2759 
2760 	old_dentry = lookup_hash(&oldnd);
2761 	error = PTR_ERR(old_dentry);
2762 	if (IS_ERR(old_dentry))
2763 		goto exit3;
2764 	/* source must exist */
2765 	error = -ENOENT;
2766 	if (!old_dentry->d_inode)
2767 		goto exit4;
2768 	/* unless the source is a directory trailing slashes give -ENOTDIR */
2769 	if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
2770 		error = -ENOTDIR;
2771 		if (oldnd.last.name[oldnd.last.len])
2772 			goto exit4;
2773 		if (newnd.last.name[newnd.last.len])
2774 			goto exit4;
2775 	}
2776 	/* source should not be ancestor of target */
2777 	error = -EINVAL;
2778 	if (old_dentry == trap)
2779 		goto exit4;
2780 	new_dentry = lookup_hash(&newnd);
2781 	error = PTR_ERR(new_dentry);
2782 	if (IS_ERR(new_dentry))
2783 		goto exit4;
2784 	/* target should not be an ancestor of source */
2785 	error = -ENOTEMPTY;
2786 	if (new_dentry == trap)
2787 		goto exit5;
2788 
	/* oldnd and newnd share one mount (checked above), so one call suffices. */
2789 	error = mnt_want_write(oldnd.path.mnt);
2790 	if (error)
2791 		goto exit5;
2792 	error = vfs_rename(old_dir->d_inode, old_dentry,
2793 				   new_dir->d_inode, new_dentry);
2794 	mnt_drop_write(oldnd.path.mnt);
2795 exit5:
2796 	dput(new_dentry);
2797 exit4:
2798 	dput(old_dentry);
2799 exit3:
2800 	unlock_rename(new_dir, old_dir);
2801 exit2:
2802 	path_put(&newnd.path);
2803 exit1:
2804 	path_put(&oldnd.path);
2805 exit:
2806 	return error;
2807 }
2808 
2809 asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
2810 			     int newdfd, const char __user *newname)
2811 {
2812 	int error;
2813 	char * from;
2814 	char * to;
2815 
2816 	from = getname(oldname);
2817 	if(IS_ERR(from))
2818 		return PTR_ERR(from);
2819 	to = getname(newname);
2820 	error = PTR_ERR(to);
2821 	if (!IS_ERR(to)) {
2822 		error = do_rename(olddfd, from, newdfd, to);
2823 		putname(to);
2824 	}
2825 	putname(from);
2826 	return error;
2827 }
2828 
2829 asmlinkage long sys_rename(const char __user *oldname, const char __user *newname)
2830 {
2831 	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
2832 }
2833 
2834 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
2835 {
2836 	int len;
2837 
2838 	len = PTR_ERR(link);
2839 	if (IS_ERR(link))
2840 		goto out;
2841 
2842 	len = strlen(link);
2843 	if (len > (unsigned) buflen)
2844 		len = buflen;
2845 	if (copy_to_user(buffer, link, len))
2846 		len = -EFAULT;
2847 out:
2848 	return len;
2849 }
2850 
2851 /*
2852  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
2853  * have ->follow_link() touching nd only in nd_set_link().  Using (or not
2854  * using) it for any given inode is up to filesystem.
2855  */
2856 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
2857 {
2858 	struct nameidata nd;
2859 	void *cookie;
2860 
2861 	nd.depth = 0;
2862 	cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
2863 	if (!IS_ERR(cookie)) {
2864 		int res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
2865 		if (dentry->d_inode->i_op->put_link)
2866 			dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
2867 		cookie = ERR_PTR(res);
2868 	}
2869 	return PTR_ERR(cookie);
2870 }
2871 
/*
 * Non-static wrapper around __vfs_follow_link(); returns its result
 * unchanged.
 */
int vfs_follow_link(struct nameidata *nd, const char *link)
{
	int ret = __vfs_follow_link(nd, link);

	return ret;
}
2876 
2877 /* get the link contents into pagecache */
2878 static char *page_getlink(struct dentry * dentry, struct page **ppage)
2879 {
2880 	struct page * page;
2881 	struct address_space *mapping = dentry->d_inode->i_mapping;
2882 	page = read_mapping_page(mapping, 0, NULL);
2883 	if (IS_ERR(page))
2884 		return (char*)page;
2885 	*ppage = page;
2886 	return kmap(page);
2887 }
2888 
2889 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
2890 {
2891 	struct page *page = NULL;
2892 	char *s = page_getlink(dentry, &page);
2893 	int res = vfs_readlink(dentry,buffer,buflen,s);
2894 	if (page) {
2895 		kunmap(page);
2896 		page_cache_release(page);
2897 	}
2898 	return res;
2899 }
2900 
2901 void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
2902 {
2903 	struct page *page = NULL;
2904 	nd_set_link(nd, page_getlink(dentry, &page));
2905 	return page;
2906 }
2907 
/*
 * ->put_link() counterpart of page_follow_link_light(): @cookie is the
 * page returned there.  A NULL cookie means there is nothing to undo.
 */
void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
	struct page *page = cookie;

	if (!page)
		return;

	kunmap(page);
	page_cache_release(page);
}
2917 
2918 int __page_symlink(struct inode *inode, const char *symname, int len,
2919 		gfp_t gfp_mask)
2920 {
2921 	struct address_space *mapping = inode->i_mapping;
2922 	struct page *page;
2923 	void *fsdata;
2924 	int err;
2925 	char *kaddr;
2926 
2927 retry:
2928 	err = pagecache_write_begin(NULL, mapping, 0, len-1,
2929 				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
2930 	if (err)
2931 		goto fail;
2932 
2933 	kaddr = kmap_atomic(page, KM_USER0);
2934 	memcpy(kaddr, symname, len-1);
2935 	kunmap_atomic(kaddr, KM_USER0);
2936 
2937 	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
2938 							page, fsdata);
2939 	if (err < 0)
2940 		goto fail;
2941 	if (err < len-1)
2942 		goto retry;
2943 
2944 	mark_inode_dirty(inode);
2945 	return 0;
2946 fail:
2947 	return err;
2948 }
2949 
2950 int page_symlink(struct inode *inode, const char *symname, int len)
2951 {
2952 	return __page_symlink(inode, symname, len,
2953 			mapping_gfp_mask(inode->i_mapping));
2954 }
2955 
/*
 * Ready-made inode operations for symlinks whose body is kept in the
 * page cache: readlink goes through generic_readlink(), which in turn
 * uses the follow_link/put_link pair defined above.
 */
2956 const struct inode_operations page_symlink_inode_operations = {
2957 	.readlink	= generic_readlink,
2958 	.follow_link	= page_follow_link_light,
2959 	.put_link	= page_put_link,
2960 };
2961 
/*
 * Symbols exported from this file.  Several of them (for example
 * __user_walk, follow_down, getname, lock_rename, permission) are
 * defined earlier in the file rather than in the section above.
 */
2962 EXPORT_SYMBOL(__user_walk);
2963 EXPORT_SYMBOL(__user_walk_fd);
2964 EXPORT_SYMBOL(follow_down);
2965 EXPORT_SYMBOL(follow_up);
2966 EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
2967 EXPORT_SYMBOL(getname);
2968 EXPORT_SYMBOL(lock_rename);
2969 EXPORT_SYMBOL(lookup_one_len);
2970 EXPORT_SYMBOL(page_follow_link_light);
2971 EXPORT_SYMBOL(page_put_link);
2972 EXPORT_SYMBOL(page_readlink);
2973 EXPORT_SYMBOL(__page_symlink);
2974 EXPORT_SYMBOL(page_symlink);
2975 EXPORT_SYMBOL(page_symlink_inode_operations);
2976 EXPORT_SYMBOL(path_lookup);
2977 EXPORT_SYMBOL(vfs_path_lookup);
2978 EXPORT_SYMBOL(permission);
2979 EXPORT_SYMBOL(vfs_permission);
2980 EXPORT_SYMBOL(file_permission);
2981 EXPORT_SYMBOL(unlock_rename);
2982 EXPORT_SYMBOL(vfs_create);
2983 EXPORT_SYMBOL(vfs_follow_link);
2984 EXPORT_SYMBOL(vfs_link);
2985 EXPORT_SYMBOL(vfs_mkdir);
2986 EXPORT_SYMBOL(vfs_mknod);
2987 EXPORT_SYMBOL(generic_permission);
2988 EXPORT_SYMBOL(vfs_readlink);
2989 EXPORT_SYMBOL(vfs_rename);
2990 EXPORT_SYMBOL(vfs_rmdir);
2991 EXPORT_SYMBOL(vfs_symlink);
2992 EXPORT_SYMBOL(vfs_unlink);
2993 EXPORT_SYMBOL(dentry_unhash);
2994 EXPORT_SYMBOL(generic_readlink);
2995