1 /* 2 * linux/fs/namei.c 3 * 4 * Copyright (C) 1991, 1992 Linus Torvalds 5 */ 6 7 /* 8 * Some corrections by tytso. 9 */ 10 11 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname 12 * lookup logic. 13 */ 14 /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. 15 */ 16 17 #include <linux/init.h> 18 #include <linux/module.h> 19 #include <linux/slab.h> 20 #include <linux/fs.h> 21 #include <linux/namei.h> 22 #include <linux/quotaops.h> 23 #include <linux/pagemap.h> 24 #include <linux/fsnotify.h> 25 #include <linux/personality.h> 26 #include <linux/security.h> 27 #include <linux/ima.h> 28 #include <linux/syscalls.h> 29 #include <linux/mount.h> 30 #include <linux/audit.h> 31 #include <linux/capability.h> 32 #include <linux/file.h> 33 #include <linux/fcntl.h> 34 #include <linux/device_cgroup.h> 35 #include <linux/fs_struct.h> 36 #include <asm/uaccess.h> 37 38 #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) 39 40 /* [Feb-1997 T. Schoebel-Theuer] 41 * Fundamental changes in the pathname lookup mechanisms (namei) 42 * were necessary because of omirr. The reason is that omirr needs 43 * to know the _real_ pathname, not the user-supplied one, in case 44 * of symlinks (and also when transname replacements occur). 45 * 46 * The new code replaces the old recursive symlink resolution with 47 * an iterative one (in case of non-nested symlink chains). It does 48 * this with calls to <fs>_follow_link(). 49 * As a side effect, dir_namei(), _namei() and follow_link() are now 50 * replaced with a single function lookup_dentry() that can handle all 51 * the special cases of the former code. 52 * 53 * With the new dcache, the pathname is stored at each inode, at least as 54 * long as the refcount of the inode is positive. As a side effect, the 55 * size of the dcache depends on the inode cache and thus is dynamic. 56 * 57 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink 58 * resolution to correspond with current state of the code. 
*
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
98 * 99 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT 100 * restored for 2.4. This is the last surviving part of old 4.2BSD bug. 101 * During the 2.4 we need to fix the userland stuff depending on it - 102 * hopefully we will be able to get rid of that wart in 2.5. So far only 103 * XEmacs seems to be relying on it... 104 */ 105 /* 106 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) 107 * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives 108 * any extra contention... 109 */ 110 111 static int __link_path_walk(const char *name, struct nameidata *nd); 112 113 /* In order to reduce some races, while at the same time doing additional 114 * checking and hopefully speeding things up, we copy filenames to the 115 * kernel data space before using them.. 116 * 117 * POSIX.1 2.4: an empty pathname is invalid (ENOENT). 118 * PATH_MAX includes the nul terminator --RR. 119 */ 120 static int do_getname(const char __user *filename, char *page) 121 { 122 int retval; 123 unsigned long len = PATH_MAX; 124 125 if (!segment_eq(get_fs(), KERNEL_DS)) { 126 if ((unsigned long) filename >= TASK_SIZE) 127 return -EFAULT; 128 if (TASK_SIZE - (unsigned long) filename < PATH_MAX) 129 len = TASK_SIZE - (unsigned long) filename; 130 } 131 132 retval = strncpy_from_user(page, filename, len); 133 if (retval > 0) { 134 if (retval < len) 135 return 0; 136 return -ENAMETOOLONG; 137 } else if (!retval) 138 retval = -ENOENT; 139 return retval; 140 } 141 142 char * getname(const char __user * filename) 143 { 144 char *tmp, *result; 145 146 result = ERR_PTR(-ENOMEM); 147 tmp = __getname(); 148 if (tmp) { 149 int retval = do_getname(filename, tmp); 150 151 result = tmp; 152 if (retval < 0) { 153 __putname(tmp); 154 result = ERR_PTR(retval); 155 } 156 } 157 audit_getname(result); 158 return result; 159 } 160 161 #ifdef CONFIG_AUDITSYSCALL 162 void putname(const char *name) 163 { 164 if (unlikely(!audit_dummy_context())) 165 
audit_putname(name); 166 else 167 __putname(name); 168 } 169 EXPORT_SYMBOL(putname); 170 #endif 171 172 173 /** 174 * generic_permission - check for access rights on a Posix-like filesystem 175 * @inode: inode to check access rights for 176 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 177 * @check_acl: optional callback to check for Posix ACLs 178 * 179 * Used to check for read/write/execute permissions on a file. 180 * We use "fsuid" for this, letting us set arbitrary permissions 181 * for filesystem access without changing the "normal" uids which 182 * are used for other things.. 183 */ 184 int generic_permission(struct inode *inode, int mask, 185 int (*check_acl)(struct inode *inode, int mask)) 186 { 187 umode_t mode = inode->i_mode; 188 189 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 190 191 if (current_fsuid() == inode->i_uid) 192 mode >>= 6; 193 else { 194 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { 195 int error = check_acl(inode, mask); 196 if (error == -EACCES) 197 goto check_capabilities; 198 else if (error != -EAGAIN) 199 return error; 200 } 201 202 if (in_group_p(inode->i_gid)) 203 mode >>= 3; 204 } 205 206 /* 207 * If the DACs are ok we don't need any capability check. 208 */ 209 if ((mask & ~mode) == 0) 210 return 0; 211 212 check_capabilities: 213 /* 214 * Read/write DACs are always overridable. 215 * Executable DACs are overridable if at least one exec bit is set. 216 */ 217 if (!(mask & MAY_EXEC) || execute_ok(inode)) 218 if (capable(CAP_DAC_OVERRIDE)) 219 return 0; 220 221 /* 222 * Searching includes executable on directories, else just read. 
223 */ 224 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) 225 if (capable(CAP_DAC_READ_SEARCH)) 226 return 0; 227 228 return -EACCES; 229 } 230 231 /** 232 * inode_permission - check for access rights to a given inode 233 * @inode: inode to check permission on 234 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 235 * 236 * Used to check for read/write/execute permissions on an inode. 237 * We use "fsuid" for this, letting us set arbitrary permissions 238 * for filesystem access without changing the "normal" uids which 239 * are used for other things. 240 */ 241 int inode_permission(struct inode *inode, int mask) 242 { 243 int retval; 244 245 if (mask & MAY_WRITE) { 246 umode_t mode = inode->i_mode; 247 248 /* 249 * Nobody gets write access to a read-only fs. 250 */ 251 if (IS_RDONLY(inode) && 252 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) 253 return -EROFS; 254 255 /* 256 * Nobody gets write access to an immutable file. 257 */ 258 if (IS_IMMUTABLE(inode)) 259 return -EACCES; 260 } 261 262 if (inode->i_op->permission) 263 retval = inode->i_op->permission(inode, mask); 264 else 265 retval = generic_permission(inode, mask, NULL); 266 267 if (retval) 268 return retval; 269 270 retval = devcgroup_inode_permission(inode, mask); 271 if (retval) 272 return retval; 273 274 return security_inode_permission(inode, 275 mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND)); 276 } 277 278 /** 279 * file_permission - check for additional access rights to a given file 280 * @file: file to check access rights for 281 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 282 * 283 * Used to check for read/write/execute permissions on an already opened 284 * file. 285 * 286 * Note: 287 * Do not use this function in new code. All access checks should 288 * be done using inode_permission(). 
289 */ 290 int file_permission(struct file *file, int mask) 291 { 292 return inode_permission(file->f_path.dentry->d_inode, mask); 293 } 294 295 /* 296 * get_write_access() gets write permission for a file. 297 * put_write_access() releases this write permission. 298 * This is used for regular files. 299 * We cannot support write (and maybe mmap read-write shared) accesses and 300 * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode 301 * can have the following values: 302 * 0: no writers, no VM_DENYWRITE mappings 303 * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist 304 * > 0: (i_writecount) users are writing to the file. 305 * 306 * Normally we operate on that counter with atomic_{inc,dec} and it's safe 307 * except for the cases where we don't hold i_writecount yet. Then we need to 308 * use {get,deny}_write_access() - these functions check the sign and refuse 309 * to do the change if sign is wrong. Exclusion between them is provided by 310 * the inode->i_lock spinlock. 311 */ 312 313 int get_write_access(struct inode * inode) 314 { 315 spin_lock(&inode->i_lock); 316 if (atomic_read(&inode->i_writecount) < 0) { 317 spin_unlock(&inode->i_lock); 318 return -ETXTBSY; 319 } 320 atomic_inc(&inode->i_writecount); 321 spin_unlock(&inode->i_lock); 322 323 return 0; 324 } 325 326 int deny_write_access(struct file * file) 327 { 328 struct inode *inode = file->f_path.dentry->d_inode; 329 330 spin_lock(&inode->i_lock); 331 if (atomic_read(&inode->i_writecount) > 0) { 332 spin_unlock(&inode->i_lock); 333 return -ETXTBSY; 334 } 335 atomic_dec(&inode->i_writecount); 336 spin_unlock(&inode->i_lock); 337 338 return 0; 339 } 340 341 /** 342 * path_get - get a reference to a path 343 * @path: path to get the reference to 344 * 345 * Given a path increment the reference count to the dentry and the vfsmount. 
346 */ 347 void path_get(struct path *path) 348 { 349 mntget(path->mnt); 350 dget(path->dentry); 351 } 352 EXPORT_SYMBOL(path_get); 353 354 /** 355 * path_put - put a reference to a path 356 * @path: path to put the reference to 357 * 358 * Given a path decrement the reference count to the dentry and the vfsmount. 359 */ 360 void path_put(struct path *path) 361 { 362 dput(path->dentry); 363 mntput(path->mnt); 364 } 365 EXPORT_SYMBOL(path_put); 366 367 /** 368 * release_open_intent - free up open intent resources 369 * @nd: pointer to nameidata 370 */ 371 void release_open_intent(struct nameidata *nd) 372 { 373 if (nd->intent.open.file->f_path.dentry == NULL) 374 put_filp(nd->intent.open.file); 375 else 376 fput(nd->intent.open.file); 377 } 378 379 static inline struct dentry * 380 do_revalidate(struct dentry *dentry, struct nameidata *nd) 381 { 382 int status = dentry->d_op->d_revalidate(dentry, nd); 383 if (unlikely(status <= 0)) { 384 /* 385 * The dentry failed validation. 386 * If d_revalidate returned 0 attempt to invalidate 387 * the dentry otherwise d_revalidate is asking us 388 * to return a fail status. 389 */ 390 if (!status) { 391 if (!d_invalidate(dentry)) { 392 dput(dentry); 393 dentry = NULL; 394 } 395 } else { 396 dput(dentry); 397 dentry = ERR_PTR(status); 398 } 399 } 400 return dentry; 401 } 402 403 /* 404 * Internal lookup() using the new generic dcache. 
405 * SMP-safe 406 */ 407 static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd) 408 { 409 struct dentry * dentry = __d_lookup(parent, name); 410 411 /* lockess __d_lookup may fail due to concurrent d_move() 412 * in some unrelated directory, so try with d_lookup 413 */ 414 if (!dentry) 415 dentry = d_lookup(parent, name); 416 417 if (dentry && dentry->d_op && dentry->d_op->d_revalidate) 418 dentry = do_revalidate(dentry, nd); 419 420 return dentry; 421 } 422 423 /* 424 * Short-cut version of permission(), for calling by 425 * path_walk(), when dcache lock is held. Combines parts 426 * of permission() and generic_permission(), and tests ONLY for 427 * MAY_EXEC permission. 428 * 429 * If appropriate, check DAC only. If not appropriate, or 430 * short-cut DAC fails, then call permission() to do more 431 * complete permission check. 432 */ 433 static int exec_permission_lite(struct inode *inode) 434 { 435 umode_t mode = inode->i_mode; 436 437 if (inode->i_op->permission) 438 return -EAGAIN; 439 440 if (current_fsuid() == inode->i_uid) 441 mode >>= 6; 442 else if (in_group_p(inode->i_gid)) 443 mode >>= 3; 444 445 if (mode & MAY_EXEC) 446 goto ok; 447 448 if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE)) 449 goto ok; 450 451 if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE)) 452 goto ok; 453 454 if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH)) 455 goto ok; 456 457 return -EACCES; 458 ok: 459 return security_inode_permission(inode, MAY_EXEC); 460 } 461 462 /* 463 * This is called when everything else fails, and we actually have 464 * to go to the low-level filesystem to find out what we should do.. 465 * 466 * We get the directory semaphore, and after getting that we also 467 * make sure that nobody added the entry to the dcache in the meantime.. 
468 * SMP-safe 469 */ 470 static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd) 471 { 472 struct dentry * result; 473 struct inode *dir = parent->d_inode; 474 475 mutex_lock(&dir->i_mutex); 476 /* 477 * First re-do the cached lookup just in case it was created 478 * while we waited for the directory semaphore.. 479 * 480 * FIXME! This could use version numbering or similar to 481 * avoid unnecessary cache lookups. 482 * 483 * The "dcache_lock" is purely to protect the RCU list walker 484 * from concurrent renames at this point (we mustn't get false 485 * negatives from the RCU list walk here, unlike the optimistic 486 * fast walk). 487 * 488 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup 489 */ 490 result = d_lookup(parent, name); 491 if (!result) { 492 struct dentry *dentry; 493 494 /* Don't create child dentry for a dead directory. */ 495 result = ERR_PTR(-ENOENT); 496 if (IS_DEADDIR(dir)) 497 goto out_unlock; 498 499 dentry = d_alloc(parent, name); 500 result = ERR_PTR(-ENOMEM); 501 if (dentry) { 502 result = dir->i_op->lookup(dir, dentry, nd); 503 if (result) 504 dput(dentry); 505 else 506 result = dentry; 507 } 508 out_unlock: 509 mutex_unlock(&dir->i_mutex); 510 return result; 511 } 512 513 /* 514 * Uhhuh! Nasty case: the cache was re-populated while 515 * we waited on the semaphore. Need to revalidate. 516 */ 517 mutex_unlock(&dir->i_mutex); 518 if (result->d_op && result->d_op->d_revalidate) { 519 result = do_revalidate(result, nd); 520 if (!result) 521 result = ERR_PTR(-ENOENT); 522 } 523 return result; 524 } 525 526 /* 527 * Wrapper to retry pathname resolution whenever the underlying 528 * file system returns an ESTALE. 529 * 530 * Retry the whole path once, forcing real lookup requests 531 * instead of relying on the dcache. 
*/
static __always_inline int link_path_walk(const char *name, struct nameidata *nd)
{
	/* Starting point, kept so that the walk can be restarted from it. */
	struct path save = nd->path;
	int result;

	/* make sure the stuff we saved doesn't go away */
	path_get(&save);

	result = __link_path_walk(name, nd);
	if (result == -ESTALE) {
		/* nd->path had been dropped */
		nd->path = save;
		/* nd needs a reference of its own; ours is put below */
		path_get(&nd->path);
		nd->flags |= LOOKUP_REVAL;
		result = __link_path_walk(name, nd);
	}

	path_put(&save);

	return result;
}

/*
 * Resolve the symlink body @link starting from nd->path.
 *
 * @link may be an ERR_PTR(); in that case nd->path is dropped and the
 * encoded error is returned.  An absolute link restarts the walk from the
 * root of current->fs.
 *
 * When the walk succeeds with nd->depth == 0 and a LAST_NORM last
 * component, that component is copied into a buffer from __getname() and
 * nd->last.name is pointed at the copy.
 * NOTE(review): the copy is not released in this function; presumably the
 * caller frees it - confirm against the users of nd->last.name.
 *
 * Returns 0 on success, the link_path_walk() error, or -ENOMEM (with
 * nd->path dropped) when the copy buffer cannot be allocated.
 */
static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
{
	int res = 0;
	char *name;
	if (IS_ERR(link))
		goto fail;

	if (*link == '/') {
		struct fs_struct *fs = current->fs;

		path_put(&nd->path);

		/* fs->lock is read-held while the root is copied and pinned */
		read_lock(&fs->lock);
		nd->path = fs->root;
		path_get(&fs->root);
		read_unlock(&fs->lock);
	}

	res = link_path_walk(link, nd);
	if (nd->depth || res || nd->last_type!=LAST_NORM)
		return res;
	/*
	 * If it is an iterative symlinks resolution in open_namei() we
	 * have to copy the last component. And all that crap because of
	 * bloody create() on broken symlinks. Furrfu...
	 */
	name = __getname();
	if (unlikely(!name)) {
		path_put(&nd->path);
		return -ENOMEM;
	}
	strcpy(name, nd->last.name);
	nd->last.name = name;
	return 0;
fail:
	path_put(&nd->path);
	return PTR_ERR(link);
}

/*
 * Drop the dentry reference held in @path, and its vfsmount reference only
 * when that mount differs from nd->path.mnt.
 */
static void path_put_conditional(struct path *path, struct nameidata *nd)
{
	dput(path->dentry);
	if (path->mnt != nd->path.mnt)
		mntput(path->mnt);
}

/*
 * Make @path the current position of @nd.  The dentry previously held by
 * nd is dropped, and so is its vfsmount when it differs from path->mnt.
 * No new reference is taken here: nd simply takes over the pointers
 * stored in @path.
 */
static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
{
	dput(nd->path.dentry);
	if (nd->path.mnt != path->mnt)
		mntput(nd->path.mnt);
	nd->path.mnt = path->mnt;
	nd->path.dentry = path->dentry;
}

/*
 * Follow the symlink at @path on behalf of @nd.
 *
 * Calls the inode's ->follow_link method; if that leaves a link body in nd
 * (nd_get_link()), the body is resolved with __vfs_follow_link().  The
 * optional ->put_link method is called afterwards with the cookie that
 * ->follow_link returned.
 *
 * Returns 0 on success, the error encoded in the cookie, or the error from
 * __vfs_follow_link().  @path is put before returning in every case.
 */
static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd)
{
	int error;
	void *cookie;
	struct dentry *dentry = path->dentry;

	touch_atime(path->mnt, dentry);
	nd_set_link(nd, NULL);

	if (path->mnt != nd->path.mnt) {
		/*
		 * nd takes over the pointers in path; the extra dget()
		 * pairs with the dput() done by path_put(path) below.
		 */
		path_to_nameidata(path, nd);
		dget(dentry);
	}
	/* pairs with the mntput() done by path_put(path) below */
	mntget(path->mnt);
	cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
	error = PTR_ERR(cookie);
	if (!IS_ERR(cookie)) {
		char *s = nd_get_link(nd);
		error = 0;
		if (s)
			error = __vfs_follow_link(nd, s);
		if (dentry->d_inode->i_op->put_link)
			dentry->d_inode->i_op->put_link(dentry, nd, cookie);
	}
	path_put(path);

	return error;
}

/*
 * This limits recursive symlink follows to 8, while
 * limiting consecutive symlinks to 40.
 *
 * Without that kind of total limit, nasty chains of consecutive
 * symlinks can cause almost arbitrarily long lookups.
645 */ 646 static inline int do_follow_link(struct path *path, struct nameidata *nd) 647 { 648 int err = -ELOOP; 649 if (current->link_count >= MAX_NESTED_LINKS) 650 goto loop; 651 if (current->total_link_count >= 40) 652 goto loop; 653 BUG_ON(nd->depth >= MAX_NESTED_LINKS); 654 cond_resched(); 655 err = security_inode_follow_link(path->dentry, nd); 656 if (err) 657 goto loop; 658 current->link_count++; 659 current->total_link_count++; 660 nd->depth++; 661 err = __do_follow_link(path, nd); 662 current->link_count--; 663 nd->depth--; 664 return err; 665 loop: 666 path_put_conditional(path, nd); 667 path_put(&nd->path); 668 return err; 669 } 670 671 int follow_up(struct vfsmount **mnt, struct dentry **dentry) 672 { 673 struct vfsmount *parent; 674 struct dentry *mountpoint; 675 spin_lock(&vfsmount_lock); 676 parent=(*mnt)->mnt_parent; 677 if (parent == *mnt) { 678 spin_unlock(&vfsmount_lock); 679 return 0; 680 } 681 mntget(parent); 682 mountpoint=dget((*mnt)->mnt_mountpoint); 683 spin_unlock(&vfsmount_lock); 684 dput(*dentry); 685 *dentry = mountpoint; 686 mntput(*mnt); 687 *mnt = parent; 688 return 1; 689 } 690 691 /* no need for dcache_lock, as serialization is taken care in 692 * namespace.c 693 */ 694 static int __follow_mount(struct path *path) 695 { 696 int res = 0; 697 while (d_mountpoint(path->dentry)) { 698 struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry); 699 if (!mounted) 700 break; 701 dput(path->dentry); 702 if (res) 703 mntput(path->mnt); 704 path->mnt = mounted; 705 path->dentry = dget(mounted->mnt_root); 706 res = 1; 707 } 708 return res; 709 } 710 711 static void follow_mount(struct vfsmount **mnt, struct dentry **dentry) 712 { 713 while (d_mountpoint(*dentry)) { 714 struct vfsmount *mounted = lookup_mnt(*mnt, *dentry); 715 if (!mounted) 716 break; 717 dput(*dentry); 718 mntput(*mnt); 719 *mnt = mounted; 720 *dentry = dget(mounted->mnt_root); 721 } 722 } 723 724 /* no need for dcache_lock, as serialization is taken care in 725 * 
namespace.c 726 */ 727 int follow_down(struct vfsmount **mnt, struct dentry **dentry) 728 { 729 struct vfsmount *mounted; 730 731 mounted = lookup_mnt(*mnt, *dentry); 732 if (mounted) { 733 dput(*dentry); 734 mntput(*mnt); 735 *mnt = mounted; 736 *dentry = dget(mounted->mnt_root); 737 return 1; 738 } 739 return 0; 740 } 741 742 static __always_inline void follow_dotdot(struct nameidata *nd) 743 { 744 struct fs_struct *fs = current->fs; 745 746 while(1) { 747 struct vfsmount *parent; 748 struct dentry *old = nd->path.dentry; 749 750 read_lock(&fs->lock); 751 if (nd->path.dentry == fs->root.dentry && 752 nd->path.mnt == fs->root.mnt) { 753 read_unlock(&fs->lock); 754 break; 755 } 756 read_unlock(&fs->lock); 757 spin_lock(&dcache_lock); 758 if (nd->path.dentry != nd->path.mnt->mnt_root) { 759 nd->path.dentry = dget(nd->path.dentry->d_parent); 760 spin_unlock(&dcache_lock); 761 dput(old); 762 break; 763 } 764 spin_unlock(&dcache_lock); 765 spin_lock(&vfsmount_lock); 766 parent = nd->path.mnt->mnt_parent; 767 if (parent == nd->path.mnt) { 768 spin_unlock(&vfsmount_lock); 769 break; 770 } 771 mntget(parent); 772 nd->path.dentry = dget(nd->path.mnt->mnt_mountpoint); 773 spin_unlock(&vfsmount_lock); 774 dput(old); 775 mntput(nd->path.mnt); 776 nd->path.mnt = parent; 777 } 778 follow_mount(&nd->path.mnt, &nd->path.dentry); 779 } 780 781 /* 782 * It's more convoluted than I'd like it to be, but... it's still fairly 783 * small and for now I'd prefer to have fast path as straight as possible. 784 * It _is_ time-critical. 
*/
/*
 * Look up one component @name below nd->path and store the result in
 * @path (vfsmount taken from nd, dentry from the lookup), then descend
 * through anything mounted on it.
 *
 * The dcache is tried first with __d_lookup(); a miss, or a dentry that
 * do_revalidate() threw away, falls back to real_lookup().
 *
 * Returns 0 on success or the error encoded in the failed lookup.
 */
static int do_lookup(struct nameidata *nd, struct qstr *name,
		     struct path *path)
{
	struct vfsmount *mnt = nd->path.mnt;
	struct dentry *dentry = __d_lookup(nd->path.dentry, name);

	if (!dentry)
		goto need_lookup;
	if (dentry->d_op && dentry->d_op->d_revalidate)
		goto need_revalidate;
done:
	path->mnt = mnt;
	path->dentry = dentry;
	__follow_mount(path);
	return 0;

need_lookup:
	dentry = real_lookup(nd->path.dentry, name, nd);
	if (IS_ERR(dentry))
		goto fail;
	goto done;

need_revalidate:
	dentry = do_revalidate(dentry, nd);
	/* NULL means the cached dentry was invalidated: look it up afresh */
	if (!dentry)
		goto need_lookup;
	if (IS_ERR(dentry))
		goto fail;
	goto done;

fail:
	return PTR_ERR(dentry);
}

/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
static int __link_path_walk(const char *name, struct nameidata *nd)
{
	struct path next;
	struct inode *inode;
	int err;
	unsigned int lookup_flags = nd->flags;

	while (*name=='/')
		name++;
	/* Only slashes (or nothing) left: jumps into the loop body below. */
	if (!*name)
		goto return_reval;

	inode = nd->path.dentry->d_inode;
	/* Inside a symlink body: always follow, keep only LOOKUP_CONTINUE. */
	if (nd->depth)
		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);

	/* At this point we know we have a real path component. */
	for(;;) {
		unsigned long hash;
		struct qstr this;
		unsigned int c;

		nd->flags |= LOOKUP_CONTINUE;
		/* -EAGAIN from the short-cut means "do the full check" */
		err = exec_permission_lite(inode);
		if (err == -EAGAIN)
			err = inode_permission(nd->path.dentry->d_inode,
					       MAY_EXEC);
		if (!err)
			err = ima_path_check(&nd->path, MAY_EXEC);
		if (err)
			break;

		this.name = name;
		c = *(const unsigned char *)name;

		/* Hash the component while scanning to its end ('/' or nul). */
		hash = init_name_hash();
		do {
			name++;
			hash = partial_name_hash(c, hash);
			c = *(const unsigned char *)name;
		} while (c && (c != '/'));
		this.len = name - (const char *) this.name;
		this.hash = end_name_hash(hash);

		/* remove trailing slashes? */
		if (!c)
			goto last_component;
		while (*++name == '/');
		if (!*name)
			goto last_with_slashes;

		/*
		 * "." and ".." are special - ".." especially so because it has
		 * to be able to know about the current root directory and
		 * parent relationships.
		 */
		if (this.name[0] == '.') switch (this.len) {
			default:
				break;
			case 2:
				if (this.name[1] != '.')
					break;
				follow_dotdot(nd);
				inode = nd->path.dentry->d_inode;
				/* fallthrough */
			case 1:
				continue;
		}
		/*
		 * See if the low-level filesystem might want
		 * to use its own hash..
		 */
		if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
			err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
							    &this);
			if (err < 0)
				break;
		}
		/* This does the actual lookups.. */
		err = do_lookup(nd, &this, &next);
		if (err)
			break;

		err = -ENOENT;
		inode = next.dentry->d_inode;
		/* negative dentry in the middle of the path */
		if (!inode)
			goto out_dput;

		if (inode->i_op->follow_link) {
			err = do_follow_link(&next, nd);
			/* skips the path_put(&nd->path) after the loop */
			if (err)
				goto return_err;
			err = -ENOENT;
			inode = nd->path.dentry->d_inode;
			if (!inode)
				break;
		} else
			path_to_nameidata(&next, nd);
		/* an intermediate component must support ->lookup */
		err = -ENOTDIR;
		if (!inode->i_op->lookup)
			break;
		continue;
		/* here ends the main loop */

last_with_slashes:
		lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
last_component:
		/* Clear LOOKUP_CONTINUE iff it was previously unset */
		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
		if (lookup_flags & LOOKUP_PARENT)
			goto lookup_parent;
		if (this.name[0] == '.') switch (this.len) {
			default:
				break;
			case 2:
				if (this.name[1] != '.')
					break;
				follow_dotdot(nd);
				inode = nd->path.dentry->d_inode;
				/* fallthrough */
			case 1:
				goto return_reval;
		}
		if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
			err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
							    &this);
			if (err < 0)
				break;
		}
		err = do_lookup(nd, &this, &next);
		if (err)
			break;
		inode = next.dentry->d_inode;
		/* the last component is followed only with LOOKUP_FOLLOW */
		if ((lookup_flags & LOOKUP_FOLLOW)
		    && inode && inode->i_op->follow_link) {
			err = do_follow_link(&next, nd);
			if (err)
				goto return_err;
			inode = nd->path.dentry->d_inode;
		} else
			path_to_nameidata(&next, nd);
		err = -ENOENT;
		if (!inode)
			break;
		if (lookup_flags & LOOKUP_DIRECTORY) {
			err = -ENOTDIR;
			if (!inode->i_op->lookup)
				break;
		}
		goto return_base;
lookup_parent:
		/*
		 * LOOKUP_PARENT: nd->path stays at the parent; record the
		 * last component and classify it as normal, "." or "..".
		 */
		nd->last = this;
		nd->last_type = LAST_NORM;
		if (this.name[0] != '.')
			goto return_base;
		if (this.len == 1)
			nd->last_type = LAST_DOT;
		else if (this.len == 2 && this.name[1] == '.')
			nd->last_type = LAST_DOTDOT;
		else
			goto return_base;
return_reval:
		/*
		 * We bypassed the ordinary revalidation routines.
		 * We may need to check the cached dentry for staleness.
		 */
		if (nd->path.dentry && nd->path.dentry->d_sb &&
		    (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
			err = -ESTALE;
			/* Note: we do not d_invalidate() */
			/*
			 * NOTE(review): d_op is dereferenced without a NULL
			 * check; presumably FS_REVAL_DOT filesystems always
			 * provide d_revalidate - confirm.
			 */
			if (!nd->path.dentry->d_op->d_revalidate(
					nd->path.dentry, nd))
				break;
		}
return_base:
		return 0;
out_dput:
		path_put_conditional(&next, nd);
		break;
	}
	/* every "break" above lands here and drops nd->path */
	path_put(&nd->path);
return_err:
	return err;
}

/*
 * Entry point for a fresh walk: resets the per-task total symlink counter
 * before handing over to link_path_walk().
 */
static int path_walk(const char *name, struct nameidata *nd)
{
	current->total_link_count = 0;
	return link_path_walk(name, nd);
}

/*
 * Returns 0 and nd will be valid on success; returns error otherwise.
 *
 * The starting point is the root of current->fs for an absolute @name,
 * its pwd when @dfd is AT_FDCWD, and otherwise the directory open on
 * @dfd.  That descriptor must refer to a directory the caller may search,
 * else -EBADF, -ENOTDIR or the file_permission() error is returned.
 * On success the resulting dentry is reported to audit_inode() when an
 * audit context is active and the dentry has an inode.
 */
static int do_path_lookup(int dfd, const char *name,
				unsigned int flags, struct nameidata *nd)
{
	int retval = 0;
	int fput_needed;
	struct file *file;
	struct fs_struct *fs = current->fs;

	nd->last_type = LAST_ROOT; /* if there are only slashes... */
	nd->flags = flags;
	nd->depth = 0;

	if (*name=='/') {
		read_lock(&fs->lock);
		nd->path = fs->root;
		path_get(&fs->root);
		read_unlock(&fs->lock);
	} else if (dfd == AT_FDCWD) {
		read_lock(&fs->lock);
		nd->path = fs->pwd;
		path_get(&fs->pwd);
		read_unlock(&fs->lock);
	} else {
		struct dentry *dentry;

		file = fget_light(dfd, &fput_needed);
		retval = -EBADF;
		if (!file)
			goto out_fail;

		dentry = file->f_path.dentry;

		retval = -ENOTDIR;
		if (!S_ISDIR(dentry->d_inode->i_mode))
			goto fput_fail;

		retval = file_permission(file, MAY_EXEC);
		if (retval)
			goto fput_fail;

		/* nd gets its own reference before the file is released */
		nd->path = file->f_path;
		path_get(&file->f_path);

		fput_light(file, fput_needed);
	}

	retval = path_walk(name, nd);
	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
				nd->path.dentry->d_inode))
		audit_inode(name, nd->path.dentry);
out_fail:
	return retval;

fput_fail:
	fput_light(file, fput_needed);
	goto out_fail;
}

/* Lookup of @name with AT_FDCWD as the base for relative names. */
int path_lookup(const char *name, unsigned int flags,
			struct nameidata *nd)
{
	return do_path_lookup(AT_FDCWD, name, flags, nd);
}

/*
 * Like path_lookup(), but hands back only the resulting path.  *path is
 * written on success only.
 */
int kern_path(const char *name, unsigned int flags, struct path *path)
{
	struct nameidata nd;
	int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
	if (!res)
		*path = nd.path;
	return res;
}

/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
 * @nd: pointer to nameidata
 *
 * A reference on the base pair is taken for @nd before the walk starts.
 * Returns the result of path_walk().
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
		    const char *name, unsigned int flags,
		    struct nameidata *nd)
{
	int retval;

	/* same as do_path_lookup */
	nd->last_type = LAST_ROOT;
	nd->flags = flags;
	nd->depth = 0;

	nd->path.dentry = dentry;
	nd->path.mnt = mnt;
	path_get(&nd->path);

	retval = path_walk(name, nd);
	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
				nd->path.dentry->d_inode))
		audit_inode(name, nd->path.dentry);

	return retval;

}

/**
 * path_lookup_open - lookup a file path with open intent
 * @dfd: the directory to use as base, or AT_FDCWD
 * @name: pointer to file name
 * @lookup_flags: lookup intent flags
 * @nd: pointer to nameidata
 * @open_flags: open intent flags
 *
 * Returns -ENFILE when no empty file structure is available.  If the
 * lookup left an ERR_PTR() in the intent file while reporting success,
 * that error is returned and nd->path is put.  If the lookup itself
 * failed, the intent file is released.
 */
static int path_lookup_open(int dfd, const char *name,
		unsigned int lookup_flags, struct nameidata *nd, int open_flags)
{
	struct file *filp = get_empty_filp();
	int err;

	if (filp == NULL)
		return -ENFILE;
	nd->intent.open.file = filp;
	nd->intent.open.flags = open_flags;
	nd->intent.open.create_mode = 0;
	err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);
	if (IS_ERR(nd->intent.open.file)) {
		if (err == 0) {
			err = PTR_ERR(nd->intent.open.file);
			path_put(&nd->path);
		}
	} else if (err != 0)
		release_open_intent(nd);
	return err;
}

/*
 * Look up @name in directory @base: first through cached_lookup(), then,
 * on a miss, by allocating a dentry and calling the directory's ->lookup
 * method.  The filesystem's ->d_hash, when present, is applied to @name
 * first.
 *
 * Returns the dentry, or ERR_PTR() of a negative ->d_hash result,
 * -ENOENT for a dead directory, -ENOMEM, or whatever ->lookup returned.
 */
static struct dentry *__lookup_hash(struct qstr *name,
		struct dentry *base, struct nameidata *nd)
{
	struct dentry *dentry;
	struct inode *inode;
	int err;

	inode = base->d_inode;

	/*
	 * See if the low-level filesystem might want
	 * to use its own hash..
	 */
	if (base->d_op && base->d_op->d_hash) {
		err = base->d_op->d_hash(base, name);
		dentry = ERR_PTR(err);
		if (err < 0)
			goto out;
	}

	dentry = cached_lookup(base, name, nd);
	if (!dentry) {
		struct dentry *new;

		/* Don't create child dentry for a dead directory. */
		dentry = ERR_PTR(-ENOENT);
		if (IS_DEADDIR(inode))
			goto out;

		new = d_alloc(base, name);
		dentry = ERR_PTR(-ENOMEM);
		if (!new)
			goto out;
		/* a non-NULL ->lookup result replaces the new dentry */
		dentry = inode->i_op->lookup(inode, new, nd);
		if (!dentry)
			dentry = new;
		else
			dput(new);
	}
out:
	return dentry;
}

/*
 * Restricted form of lookup. Doesn't follow links, single-component only,
 * needs parent already locked. Doesn't follow mounts.
 * SMP-safe.
 *
 * Looks up nd->last in nd->path.dentry after checking MAY_EXEC on that
 * directory; a permission failure is returned as ERR_PTR().
 */
static struct dentry *lookup_hash(struct nameidata *nd)
{
	int err;

	err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC);
	if (err)
		return ERR_PTR(err);
	return __lookup_hash(&nd->last, nd->path.dentry, nd);
}

/*
 * Fill the qstr @this for the first @len bytes of @name and compute its
 * hash.  Returns -EACCES for an empty name or one that contains '/' or a
 * nul byte within @len, 0 otherwise.  @base is not used here.
 */
static int __lookup_one_len(const char *name, struct qstr *this,
		struct dentry *base, int len)
{
	unsigned long hash;
	unsigned int c;

	this->name = name;
	this->len = len;
	if (!len)
		return -EACCES;

	hash = init_name_hash();
	while (len--) {
		c = *(const unsigned char *)name++;
		if (c == '/' || c == '\0')
			return -EACCES;
		hash = partial_name_hash(c, hash);
	}
	this->hash = end_name_hash(hash);
	return 0;
}

/**
 * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.  Also note that by using this function the
 * nameidata argument is passed to the filesystem methods and a filesystem
 * using this helper needs to be prepared for that.
1245 */ 1246 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) 1247 { 1248 int err; 1249 struct qstr this; 1250 1251 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); 1252 1253 err = __lookup_one_len(name, &this, base, len); 1254 if (err) 1255 return ERR_PTR(err); 1256 1257 err = inode_permission(base->d_inode, MAY_EXEC); 1258 if (err) 1259 return ERR_PTR(err); 1260 return __lookup_hash(&this, base, NULL); 1261 } 1262 1263 /** 1264 * lookup_one_noperm - bad hack for sysfs 1265 * @name: pathname component to lookup 1266 * @base: base directory to lookup from 1267 * 1268 * This is a variant of lookup_one_len that doesn't perform any permission 1269 * checks. It's a horrible hack to work around the braindead sysfs 1270 * architecture and should not be used anywhere else. 1271 * 1272 * DON'T USE THIS FUNCTION EVER, thanks. 1273 */ 1274 struct dentry *lookup_one_noperm(const char *name, struct dentry *base) 1275 { 1276 int err; 1277 struct qstr this; 1278 1279 err = __lookup_one_len(name, &this, base, strlen(name)); 1280 if (err) 1281 return ERR_PTR(err); 1282 return __lookup_hash(&this, base, NULL); 1283 } 1284 1285 int user_path_at(int dfd, const char __user *name, unsigned flags, 1286 struct path *path) 1287 { 1288 struct nameidata nd; 1289 char *tmp = getname(name); 1290 int err = PTR_ERR(tmp); 1291 if (!IS_ERR(tmp)) { 1292 1293 BUG_ON(flags & LOOKUP_PARENT); 1294 1295 err = do_path_lookup(dfd, tmp, flags, &nd); 1296 putname(tmp); 1297 if (!err) 1298 *path = nd.path; 1299 } 1300 return err; 1301 } 1302 1303 static int user_path_parent(int dfd, const char __user *path, 1304 struct nameidata *nd, char **name) 1305 { 1306 char *s = getname(path); 1307 int error; 1308 1309 if (IS_ERR(s)) 1310 return PTR_ERR(s); 1311 1312 error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd); 1313 if (error) 1314 putname(s); 1315 else 1316 *name = s; 1317 1318 return error; 1319 } 1320 1321 /* 1322 * It's inline, so penalty for filesystems that don't use 
sticky bit is 1323 * minimal. 1324 */ 1325 static inline int check_sticky(struct inode *dir, struct inode *inode) 1326 { 1327 uid_t fsuid = current_fsuid(); 1328 1329 if (!(dir->i_mode & S_ISVTX)) 1330 return 0; 1331 if (inode->i_uid == fsuid) 1332 return 0; 1333 if (dir->i_uid == fsuid) 1334 return 0; 1335 return !capable(CAP_FOWNER); 1336 } 1337 1338 /* 1339 * Check whether we can remove a link victim from directory dir, check 1340 * whether the type of victim is right. 1341 * 1. We can't do it if dir is read-only (done in permission()) 1342 * 2. We should have write and exec permissions on dir 1343 * 3. We can't remove anything from append-only dir 1344 * 4. We can't do anything with immutable dir (done in permission()) 1345 * 5. If the sticky bit on dir is set we should either 1346 * a. be owner of dir, or 1347 * b. be owner of victim, or 1348 * c. have CAP_FOWNER capability 1349 * 6. If the victim is append-only or immutable we can't do antyhing with 1350 * links pointing to it. 1351 * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. 1352 * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. 1353 * 9. We can't remove a root or mountpoint. 1354 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by 1355 * nfs_async_unlink(). 
1356 */ 1357 static int may_delete(struct inode *dir,struct dentry *victim,int isdir) 1358 { 1359 int error; 1360 1361 if (!victim->d_inode) 1362 return -ENOENT; 1363 1364 BUG_ON(victim->d_parent->d_inode != dir); 1365 audit_inode_child(victim->d_name.name, victim, dir); 1366 1367 error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 1368 if (error) 1369 return error; 1370 if (IS_APPEND(dir)) 1371 return -EPERM; 1372 if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)|| 1373 IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) 1374 return -EPERM; 1375 if (isdir) { 1376 if (!S_ISDIR(victim->d_inode->i_mode)) 1377 return -ENOTDIR; 1378 if (IS_ROOT(victim)) 1379 return -EBUSY; 1380 } else if (S_ISDIR(victim->d_inode->i_mode)) 1381 return -EISDIR; 1382 if (IS_DEADDIR(dir)) 1383 return -ENOENT; 1384 if (victim->d_flags & DCACHE_NFSFS_RENAMED) 1385 return -EBUSY; 1386 return 0; 1387 } 1388 1389 /* Check whether we can create an object with dentry child in directory 1390 * dir. 1391 * 1. We can't do it if child already exists (open has special treatment for 1392 * this case, but since we are inlined it's OK) 1393 * 2. We can't do it if dir is read-only (done in permission()) 1394 * 3. We should have write and exec permissions on dir 1395 * 4. We can't do it if dir is immutable (done in permission()) 1396 */ 1397 static inline int may_create(struct inode *dir, struct dentry *child) 1398 { 1399 if (child->d_inode) 1400 return -EEXIST; 1401 if (IS_DEADDIR(dir)) 1402 return -ENOENT; 1403 return inode_permission(dir, MAY_WRITE | MAY_EXEC); 1404 } 1405 1406 /* 1407 * O_DIRECTORY translates into forcing a directory lookup. 1408 */ 1409 static inline int lookup_flags(unsigned int f) 1410 { 1411 unsigned long retval = LOOKUP_FOLLOW; 1412 1413 if (f & O_NOFOLLOW) 1414 retval &= ~LOOKUP_FOLLOW; 1415 1416 if (f & O_DIRECTORY) 1417 retval |= LOOKUP_DIRECTORY; 1418 1419 return retval; 1420 } 1421 1422 /* 1423 * p1 and p2 should be directories on the same fs. 
1424 */ 1425 struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) 1426 { 1427 struct dentry *p; 1428 1429 if (p1 == p2) { 1430 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); 1431 return NULL; 1432 } 1433 1434 mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex); 1435 1436 p = d_ancestor(p2, p1); 1437 if (p) { 1438 mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); 1439 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); 1440 return p; 1441 } 1442 1443 p = d_ancestor(p1, p2); 1444 if (p) { 1445 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); 1446 mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); 1447 return p; 1448 } 1449 1450 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); 1451 mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); 1452 return NULL; 1453 } 1454 1455 void unlock_rename(struct dentry *p1, struct dentry *p2) 1456 { 1457 mutex_unlock(&p1->d_inode->i_mutex); 1458 if (p1 != p2) { 1459 mutex_unlock(&p2->d_inode->i_mutex); 1460 mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); 1461 } 1462 } 1463 1464 int vfs_create(struct inode *dir, struct dentry *dentry, int mode, 1465 struct nameidata *nd) 1466 { 1467 int error = may_create(dir, dentry); 1468 1469 if (error) 1470 return error; 1471 1472 if (!dir->i_op->create) 1473 return -EACCES; /* shouldn't it be ENOSYS? 
*/ 1474 mode &= S_IALLUGO; 1475 mode |= S_IFREG; 1476 error = security_inode_create(dir, dentry, mode); 1477 if (error) 1478 return error; 1479 vfs_dq_init(dir); 1480 error = dir->i_op->create(dir, dentry, mode, nd); 1481 if (!error) 1482 fsnotify_create(dir, dentry); 1483 return error; 1484 } 1485 1486 int may_open(struct path *path, int acc_mode, int flag) 1487 { 1488 struct dentry *dentry = path->dentry; 1489 struct inode *inode = dentry->d_inode; 1490 int error; 1491 1492 if (!inode) 1493 return -ENOENT; 1494 1495 switch (inode->i_mode & S_IFMT) { 1496 case S_IFLNK: 1497 return -ELOOP; 1498 case S_IFDIR: 1499 if (acc_mode & MAY_WRITE) 1500 return -EISDIR; 1501 break; 1502 case S_IFBLK: 1503 case S_IFCHR: 1504 if (path->mnt->mnt_flags & MNT_NODEV) 1505 return -EACCES; 1506 /*FALLTHRU*/ 1507 case S_IFIFO: 1508 case S_IFSOCK: 1509 flag &= ~O_TRUNC; 1510 break; 1511 } 1512 1513 error = inode_permission(inode, acc_mode); 1514 if (error) 1515 return error; 1516 1517 error = ima_path_check(path, 1518 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); 1519 if (error) 1520 return error; 1521 /* 1522 * An append-only file must be opened in append mode for writing. 1523 */ 1524 if (IS_APPEND(inode)) { 1525 if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) 1526 return -EPERM; 1527 if (flag & O_TRUNC) 1528 return -EPERM; 1529 } 1530 1531 /* O_NOATIME can only be set by the owner or superuser */ 1532 if (flag & O_NOATIME) 1533 if (!is_owner_or_cap(inode)) 1534 return -EPERM; 1535 1536 /* 1537 * Ensure there are no outstanding leases on the file. 1538 */ 1539 error = break_lease(inode, flag); 1540 if (error) 1541 return error; 1542 1543 if (flag & O_TRUNC) { 1544 error = get_write_access(inode); 1545 if (error) 1546 return error; 1547 1548 /* 1549 * Refuse to truncate files with mandatory locks held on them. 
1550 */ 1551 error = locks_verify_locked(inode); 1552 if (!error) 1553 error = security_path_truncate(path, 0, 1554 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); 1555 if (!error) { 1556 vfs_dq_init(inode); 1557 1558 error = do_truncate(dentry, 0, 1559 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, 1560 NULL); 1561 } 1562 put_write_access(inode); 1563 if (error) 1564 return error; 1565 } else 1566 if (flag & FMODE_WRITE) 1567 vfs_dq_init(inode); 1568 1569 return 0; 1570 } 1571 1572 /* 1573 * Be careful about ever adding any more callers of this 1574 * function. Its flags must be in the namei format, not 1575 * what get passed to sys_open(). 1576 */ 1577 static int __open_namei_create(struct nameidata *nd, struct path *path, 1578 int flag, int mode) 1579 { 1580 int error; 1581 struct dentry *dir = nd->path.dentry; 1582 1583 if (!IS_POSIXACL(dir->d_inode)) 1584 mode &= ~current_umask(); 1585 error = security_path_mknod(&nd->path, path->dentry, mode, 0); 1586 if (error) 1587 goto out_unlock; 1588 error = vfs_create(dir->d_inode, path->dentry, mode, nd); 1589 out_unlock: 1590 mutex_unlock(&dir->d_inode->i_mutex); 1591 dput(nd->path.dentry); 1592 nd->path.dentry = path->dentry; 1593 if (error) 1594 return error; 1595 /* Don't check for write permission, don't truncate */ 1596 return may_open(&nd->path, 0, flag & ~O_TRUNC); 1597 } 1598 1599 /* 1600 * Note that while the flag value (low two bits) for sys_open means: 1601 * 00 - read-only 1602 * 01 - write-only 1603 * 10 - read-write 1604 * 11 - special 1605 * it is changed into 1606 * 00 - no permissions needed 1607 * 01 - read-permission 1608 * 10 - write-permission 1609 * 11 - read-write 1610 * for the internal routines (ie open_namei()/follow_link() etc) 1611 * This is more logical, and also allows the 00 "no perm needed" 1612 * to be used for symlinks (where the permissions are checked 1613 * later). 
1614 * 1615 */ 1616 static inline int open_to_namei_flags(int flag) 1617 { 1618 if ((flag+1) & O_ACCMODE) 1619 flag++; 1620 return flag; 1621 } 1622 1623 static int open_will_write_to_fs(int flag, struct inode *inode) 1624 { 1625 /* 1626 * We'll never write to the fs underlying 1627 * a device file. 1628 */ 1629 if (special_file(inode->i_mode)) 1630 return 0; 1631 return (flag & O_TRUNC); 1632 } 1633 1634 /* 1635 * Note that the low bits of the passed in "open_flag" 1636 * are not the same as in the local variable "flag". See 1637 * open_to_namei_flags() for more details. 1638 */ 1639 struct file *do_filp_open(int dfd, const char *pathname, 1640 int open_flag, int mode, int acc_mode) 1641 { 1642 struct file *filp; 1643 struct nameidata nd; 1644 int error; 1645 struct path path; 1646 struct dentry *dir; 1647 int count = 0; 1648 int will_write; 1649 int flag = open_to_namei_flags(open_flag); 1650 1651 if (!acc_mode) 1652 acc_mode = MAY_OPEN | ACC_MODE(flag); 1653 1654 /* O_TRUNC implies we need access checks for write permissions */ 1655 if (flag & O_TRUNC) 1656 acc_mode |= MAY_WRITE; 1657 1658 /* Allow the LSM permission hook to distinguish append 1659 access from general write access. */ 1660 if (flag & O_APPEND) 1661 acc_mode |= MAY_APPEND; 1662 1663 /* 1664 * The simplest case - just a plain lookup. 1665 */ 1666 if (!(flag & O_CREAT)) { 1667 error = path_lookup_open(dfd, pathname, lookup_flags(flag), 1668 &nd, flag); 1669 if (error) 1670 return ERR_PTR(error); 1671 goto ok; 1672 } 1673 1674 /* 1675 * Create - we need to know the parent. 1676 */ 1677 error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); 1678 if (error) 1679 return ERR_PTR(error); 1680 1681 /* 1682 * We have the parent and last component. First of all, check 1683 * that we are not asked to creat(2) an obvious directory - that 1684 * will not do. 
1685 */ 1686 error = -EISDIR; 1687 if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len]) 1688 goto exit_parent; 1689 1690 error = -ENFILE; 1691 filp = get_empty_filp(); 1692 if (filp == NULL) 1693 goto exit_parent; 1694 nd.intent.open.file = filp; 1695 nd.intent.open.flags = flag; 1696 nd.intent.open.create_mode = mode; 1697 dir = nd.path.dentry; 1698 nd.flags &= ~LOOKUP_PARENT; 1699 nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN; 1700 if (flag & O_EXCL) 1701 nd.flags |= LOOKUP_EXCL; 1702 mutex_lock(&dir->d_inode->i_mutex); 1703 path.dentry = lookup_hash(&nd); 1704 path.mnt = nd.path.mnt; 1705 1706 do_last: 1707 error = PTR_ERR(path.dentry); 1708 if (IS_ERR(path.dentry)) { 1709 mutex_unlock(&dir->d_inode->i_mutex); 1710 goto exit; 1711 } 1712 1713 if (IS_ERR(nd.intent.open.file)) { 1714 error = PTR_ERR(nd.intent.open.file); 1715 goto exit_mutex_unlock; 1716 } 1717 1718 /* Negative dentry, just create the file */ 1719 if (!path.dentry->d_inode) { 1720 /* 1721 * This write is needed to ensure that a 1722 * ro->rw transition does not occur between 1723 * the time when the file is created and when 1724 * a permanent write count is taken through 1725 * the 'struct file' in nameidata_to_filp(). 1726 */ 1727 error = mnt_want_write(nd.path.mnt); 1728 if (error) 1729 goto exit_mutex_unlock; 1730 error = __open_namei_create(&nd, &path, flag, mode); 1731 if (error) { 1732 mnt_drop_write(nd.path.mnt); 1733 goto exit; 1734 } 1735 filp = nameidata_to_filp(&nd, open_flag); 1736 mnt_drop_write(nd.path.mnt); 1737 return filp; 1738 } 1739 1740 /* 1741 * It already exists. 
1742 */ 1743 mutex_unlock(&dir->d_inode->i_mutex); 1744 audit_inode(pathname, path.dentry); 1745 1746 error = -EEXIST; 1747 if (flag & O_EXCL) 1748 goto exit_dput; 1749 1750 if (__follow_mount(&path)) { 1751 error = -ELOOP; 1752 if (flag & O_NOFOLLOW) 1753 goto exit_dput; 1754 } 1755 1756 error = -ENOENT; 1757 if (!path.dentry->d_inode) 1758 goto exit_dput; 1759 if (path.dentry->d_inode->i_op->follow_link) 1760 goto do_link; 1761 1762 path_to_nameidata(&path, &nd); 1763 error = -EISDIR; 1764 if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)) 1765 goto exit; 1766 ok: 1767 /* 1768 * Consider: 1769 * 1. may_open() truncates a file 1770 * 2. a rw->ro mount transition occurs 1771 * 3. nameidata_to_filp() fails due to 1772 * the ro mount. 1773 * That would be inconsistent, and should 1774 * be avoided. Taking this mnt write here 1775 * ensures that (2) can not occur. 1776 */ 1777 will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode); 1778 if (will_write) { 1779 error = mnt_want_write(nd.path.mnt); 1780 if (error) 1781 goto exit; 1782 } 1783 error = may_open(&nd.path, acc_mode, flag); 1784 if (error) { 1785 if (will_write) 1786 mnt_drop_write(nd.path.mnt); 1787 goto exit; 1788 } 1789 filp = nameidata_to_filp(&nd, open_flag); 1790 /* 1791 * It is now safe to drop the mnt write 1792 * because the filp has had a write taken 1793 * on its behalf. 1794 */ 1795 if (will_write) 1796 mnt_drop_write(nd.path.mnt); 1797 return filp; 1798 1799 exit_mutex_unlock: 1800 mutex_unlock(&dir->d_inode->i_mutex); 1801 exit_dput: 1802 path_put_conditional(&path, &nd); 1803 exit: 1804 if (!IS_ERR(nd.intent.open.file)) 1805 release_open_intent(&nd); 1806 exit_parent: 1807 path_put(&nd.path); 1808 return ERR_PTR(error); 1809 1810 do_link: 1811 error = -ELOOP; 1812 if (flag & O_NOFOLLOW) 1813 goto exit_dput; 1814 /* 1815 * This is subtle. Instead of calling do_follow_link() we do the 1816 * thing by hands. 
The reason is that this way we have zero link_count 1817 * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT. 1818 * After that we have the parent and last component, i.e. 1819 * we are in the same situation as after the first path_walk(). 1820 * Well, almost - if the last component is normal we get its copy 1821 * stored in nd->last.name and we will have to putname() it when we 1822 * are done. Procfs-like symlinks just set LAST_BIND. 1823 */ 1824 nd.flags |= LOOKUP_PARENT; 1825 error = security_inode_follow_link(path.dentry, &nd); 1826 if (error) 1827 goto exit_dput; 1828 error = __do_follow_link(&path, &nd); 1829 if (error) { 1830 /* Does someone understand code flow here? Or it is only 1831 * me so stupid? Anathema to whoever designed this non-sense 1832 * with "intent.open". 1833 */ 1834 release_open_intent(&nd); 1835 return ERR_PTR(error); 1836 } 1837 nd.flags &= ~LOOKUP_PARENT; 1838 if (nd.last_type == LAST_BIND) 1839 goto ok; 1840 error = -EISDIR; 1841 if (nd.last_type != LAST_NORM) 1842 goto exit; 1843 if (nd.last.name[nd.last.len]) { 1844 __putname(nd.last.name); 1845 goto exit; 1846 } 1847 error = -ELOOP; 1848 if (count++==32) { 1849 __putname(nd.last.name); 1850 goto exit; 1851 } 1852 dir = nd.path.dentry; 1853 mutex_lock(&dir->d_inode->i_mutex); 1854 path.dentry = lookup_hash(&nd); 1855 path.mnt = nd.path.mnt; 1856 __putname(nd.last.name); 1857 goto do_last; 1858 } 1859 1860 /** 1861 * filp_open - open file and return file pointer 1862 * 1863 * @filename: path to open 1864 * @flags: open flags as per the open(2) second argument 1865 * @mode: mode for the new file if O_CREAT is set, else ignored 1866 * 1867 * This is the helper to open a file from kernelspace if you really 1868 * have to. But in generally you should not do this, so please move 1869 * along, nothing to see here.. 
1870 */ 1871 struct file *filp_open(const char *filename, int flags, int mode) 1872 { 1873 return do_filp_open(AT_FDCWD, filename, flags, mode, 0); 1874 } 1875 EXPORT_SYMBOL(filp_open); 1876 1877 /** 1878 * lookup_create - lookup a dentry, creating it if it doesn't exist 1879 * @nd: nameidata info 1880 * @is_dir: directory flag 1881 * 1882 * Simple function to lookup and return a dentry and create it 1883 * if it doesn't exist. Is SMP-safe. 1884 * 1885 * Returns with nd->path.dentry->d_inode->i_mutex locked. 1886 */ 1887 struct dentry *lookup_create(struct nameidata *nd, int is_dir) 1888 { 1889 struct dentry *dentry = ERR_PTR(-EEXIST); 1890 1891 mutex_lock_nested(&nd->path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 1892 /* 1893 * Yucky last component or no last component at all? 1894 * (foo/., foo/.., /////) 1895 */ 1896 if (nd->last_type != LAST_NORM) 1897 goto fail; 1898 nd->flags &= ~LOOKUP_PARENT; 1899 nd->flags |= LOOKUP_CREATE | LOOKUP_EXCL; 1900 nd->intent.open.flags = O_EXCL; 1901 1902 /* 1903 * Do the final lookup. 1904 */ 1905 dentry = lookup_hash(nd); 1906 if (IS_ERR(dentry)) 1907 goto fail; 1908 1909 if (dentry->d_inode) 1910 goto eexist; 1911 /* 1912 * Special case - lookup gave negative, but... we had foo/bar/ 1913 * From the vfs_mknod() POV we just have a negative dentry - 1914 * all is fine. Let's be bastards - you had / on the end, you've 1915 * been asking for (non-existent) directory. -ENOENT for you. 
1916 */ 1917 if (unlikely(!is_dir && nd->last.name[nd->last.len])) { 1918 dput(dentry); 1919 dentry = ERR_PTR(-ENOENT); 1920 } 1921 return dentry; 1922 eexist: 1923 dput(dentry); 1924 dentry = ERR_PTR(-EEXIST); 1925 fail: 1926 return dentry; 1927 } 1928 EXPORT_SYMBOL_GPL(lookup_create); 1929 1930 int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) 1931 { 1932 int error = may_create(dir, dentry); 1933 1934 if (error) 1935 return error; 1936 1937 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) 1938 return -EPERM; 1939 1940 if (!dir->i_op->mknod) 1941 return -EPERM; 1942 1943 error = devcgroup_inode_mknod(mode, dev); 1944 if (error) 1945 return error; 1946 1947 error = security_inode_mknod(dir, dentry, mode, dev); 1948 if (error) 1949 return error; 1950 1951 vfs_dq_init(dir); 1952 error = dir->i_op->mknod(dir, dentry, mode, dev); 1953 if (!error) 1954 fsnotify_create(dir, dentry); 1955 return error; 1956 } 1957 1958 static int may_mknod(mode_t mode) 1959 { 1960 switch (mode & S_IFMT) { 1961 case S_IFREG: 1962 case S_IFCHR: 1963 case S_IFBLK: 1964 case S_IFIFO: 1965 case S_IFSOCK: 1966 case 0: /* zero mode translates to S_IFREG */ 1967 return 0; 1968 case S_IFDIR: 1969 return -EPERM; 1970 default: 1971 return -EINVAL; 1972 } 1973 } 1974 1975 SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode, 1976 unsigned, dev) 1977 { 1978 int error; 1979 char *tmp; 1980 struct dentry *dentry; 1981 struct nameidata nd; 1982 1983 if (S_ISDIR(mode)) 1984 return -EPERM; 1985 1986 error = user_path_parent(dfd, filename, &nd, &tmp); 1987 if (error) 1988 return error; 1989 1990 dentry = lookup_create(&nd, 0); 1991 if (IS_ERR(dentry)) { 1992 error = PTR_ERR(dentry); 1993 goto out_unlock; 1994 } 1995 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 1996 mode &= ~current_umask(); 1997 error = may_mknod(mode); 1998 if (error) 1999 goto out_dput; 2000 error = mnt_want_write(nd.path.mnt); 2001 if (error) 2002 goto out_dput; 2003 error = 
security_path_mknod(&nd.path, dentry, mode, dev); 2004 if (error) 2005 goto out_drop_write; 2006 switch (mode & S_IFMT) { 2007 case 0: case S_IFREG: 2008 error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd); 2009 break; 2010 case S_IFCHR: case S_IFBLK: 2011 error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode, 2012 new_decode_dev(dev)); 2013 break; 2014 case S_IFIFO: case S_IFSOCK: 2015 error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0); 2016 break; 2017 } 2018 out_drop_write: 2019 mnt_drop_write(nd.path.mnt); 2020 out_dput: 2021 dput(dentry); 2022 out_unlock: 2023 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2024 path_put(&nd.path); 2025 putname(tmp); 2026 2027 return error; 2028 } 2029 2030 SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev) 2031 { 2032 return sys_mknodat(AT_FDCWD, filename, mode, dev); 2033 } 2034 2035 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 2036 { 2037 int error = may_create(dir, dentry); 2038 2039 if (error) 2040 return error; 2041 2042 if (!dir->i_op->mkdir) 2043 return -EPERM; 2044 2045 mode &= (S_IRWXUGO|S_ISVTX); 2046 error = security_inode_mkdir(dir, dentry, mode); 2047 if (error) 2048 return error; 2049 2050 vfs_dq_init(dir); 2051 error = dir->i_op->mkdir(dir, dentry, mode); 2052 if (!error) 2053 fsnotify_mkdir(dir, dentry); 2054 return error; 2055 } 2056 2057 SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode) 2058 { 2059 int error = 0; 2060 char * tmp; 2061 struct dentry *dentry; 2062 struct nameidata nd; 2063 2064 error = user_path_parent(dfd, pathname, &nd, &tmp); 2065 if (error) 2066 goto out_err; 2067 2068 dentry = lookup_create(&nd, 1); 2069 error = PTR_ERR(dentry); 2070 if (IS_ERR(dentry)) 2071 goto out_unlock; 2072 2073 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 2074 mode &= ~current_umask(); 2075 error = mnt_want_write(nd.path.mnt); 2076 if (error) 2077 goto out_dput; 2078 error = security_path_mkdir(&nd.path, dentry, mode); 2079 if 
(error) 2080 goto out_drop_write; 2081 error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); 2082 out_drop_write: 2083 mnt_drop_write(nd.path.mnt); 2084 out_dput: 2085 dput(dentry); 2086 out_unlock: 2087 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2088 path_put(&nd.path); 2089 putname(tmp); 2090 out_err: 2091 return error; 2092 } 2093 2094 SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode) 2095 { 2096 return sys_mkdirat(AT_FDCWD, pathname, mode); 2097 } 2098 2099 /* 2100 * We try to drop the dentry early: we should have 2101 * a usage count of 2 if we're the only user of this 2102 * dentry, and if that is true (possibly after pruning 2103 * the dcache), then we drop the dentry now. 2104 * 2105 * A low-level filesystem can, if it choses, legally 2106 * do a 2107 * 2108 * if (!d_unhashed(dentry)) 2109 * return -EBUSY; 2110 * 2111 * if it cannot handle the case of removing a directory 2112 * that is still in use by something else.. 2113 */ 2114 void dentry_unhash(struct dentry *dentry) 2115 { 2116 dget(dentry); 2117 shrink_dcache_parent(dentry); 2118 spin_lock(&dcache_lock); 2119 spin_lock(&dentry->d_lock); 2120 if (atomic_read(&dentry->d_count) == 2) 2121 __d_drop(dentry); 2122 spin_unlock(&dentry->d_lock); 2123 spin_unlock(&dcache_lock); 2124 } 2125 2126 int vfs_rmdir(struct inode *dir, struct dentry *dentry) 2127 { 2128 int error = may_delete(dir, dentry, 1); 2129 2130 if (error) 2131 return error; 2132 2133 if (!dir->i_op->rmdir) 2134 return -EPERM; 2135 2136 vfs_dq_init(dir); 2137 2138 mutex_lock(&dentry->d_inode->i_mutex); 2139 dentry_unhash(dentry); 2140 if (d_mountpoint(dentry)) 2141 error = -EBUSY; 2142 else { 2143 error = security_inode_rmdir(dir, dentry); 2144 if (!error) { 2145 error = dir->i_op->rmdir(dir, dentry); 2146 if (!error) 2147 dentry->d_inode->i_flags |= S_DEAD; 2148 } 2149 } 2150 mutex_unlock(&dentry->d_inode->i_mutex); 2151 if (!error) { 2152 d_delete(dentry); 2153 } 2154 dput(dentry); 2155 2156 return error; 2157 } 2158 
2159 static long do_rmdir(int dfd, const char __user *pathname) 2160 { 2161 int error = 0; 2162 char * name; 2163 struct dentry *dentry; 2164 struct nameidata nd; 2165 2166 error = user_path_parent(dfd, pathname, &nd, &name); 2167 if (error) 2168 return error; 2169 2170 switch(nd.last_type) { 2171 case LAST_DOTDOT: 2172 error = -ENOTEMPTY; 2173 goto exit1; 2174 case LAST_DOT: 2175 error = -EINVAL; 2176 goto exit1; 2177 case LAST_ROOT: 2178 error = -EBUSY; 2179 goto exit1; 2180 } 2181 2182 nd.flags &= ~LOOKUP_PARENT; 2183 2184 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 2185 dentry = lookup_hash(&nd); 2186 error = PTR_ERR(dentry); 2187 if (IS_ERR(dentry)) 2188 goto exit2; 2189 error = mnt_want_write(nd.path.mnt); 2190 if (error) 2191 goto exit3; 2192 error = security_path_rmdir(&nd.path, dentry); 2193 if (error) 2194 goto exit4; 2195 error = vfs_rmdir(nd.path.dentry->d_inode, dentry); 2196 exit4: 2197 mnt_drop_write(nd.path.mnt); 2198 exit3: 2199 dput(dentry); 2200 exit2: 2201 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2202 exit1: 2203 path_put(&nd.path); 2204 putname(name); 2205 return error; 2206 } 2207 2208 SYSCALL_DEFINE1(rmdir, const char __user *, pathname) 2209 { 2210 return do_rmdir(AT_FDCWD, pathname); 2211 } 2212 2213 int vfs_unlink(struct inode *dir, struct dentry *dentry) 2214 { 2215 int error = may_delete(dir, dentry, 0); 2216 2217 if (error) 2218 return error; 2219 2220 if (!dir->i_op->unlink) 2221 return -EPERM; 2222 2223 vfs_dq_init(dir); 2224 2225 mutex_lock(&dentry->d_inode->i_mutex); 2226 if (d_mountpoint(dentry)) 2227 error = -EBUSY; 2228 else { 2229 error = security_inode_unlink(dir, dentry); 2230 if (!error) 2231 error = dir->i_op->unlink(dir, dentry); 2232 } 2233 mutex_unlock(&dentry->d_inode->i_mutex); 2234 2235 /* We don't d_delete() NFS sillyrenamed files--they still exist. 
*/ 2236 if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { 2237 fsnotify_link_count(dentry->d_inode); 2238 d_delete(dentry); 2239 } 2240 2241 return error; 2242 } 2243 2244 /* 2245 * Make sure that the actual truncation of the file will occur outside its 2246 * directory's i_mutex. Truncate can take a long time if there is a lot of 2247 * writeout happening, and we don't want to prevent access to the directory 2248 * while waiting on the I/O. 2249 */ 2250 static long do_unlinkat(int dfd, const char __user *pathname) 2251 { 2252 int error; 2253 char *name; 2254 struct dentry *dentry; 2255 struct nameidata nd; 2256 struct inode *inode = NULL; 2257 2258 error = user_path_parent(dfd, pathname, &nd, &name); 2259 if (error) 2260 return error; 2261 2262 error = -EISDIR; 2263 if (nd.last_type != LAST_NORM) 2264 goto exit1; 2265 2266 nd.flags &= ~LOOKUP_PARENT; 2267 2268 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 2269 dentry = lookup_hash(&nd); 2270 error = PTR_ERR(dentry); 2271 if (!IS_ERR(dentry)) { 2272 /* Why not before? Because we want correct error value */ 2273 if (nd.last.name[nd.last.len]) 2274 goto slashes; 2275 inode = dentry->d_inode; 2276 if (inode) 2277 atomic_inc(&inode->i_count); 2278 error = mnt_want_write(nd.path.mnt); 2279 if (error) 2280 goto exit2; 2281 error = security_path_unlink(&nd.path, dentry); 2282 if (error) 2283 goto exit3; 2284 error = vfs_unlink(nd.path.dentry->d_inode, dentry); 2285 exit3: 2286 mnt_drop_write(nd.path.mnt); 2287 exit2: 2288 dput(dentry); 2289 } 2290 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2291 if (inode) 2292 iput(inode); /* truncate the inode here */ 2293 exit1: 2294 path_put(&nd.path); 2295 putname(name); 2296 return error; 2297 2298 slashes: 2299 error = !dentry->d_inode ? -ENOENT : 2300 S_ISDIR(dentry->d_inode->i_mode) ? 
-EISDIR : -ENOTDIR; 2301 goto exit2; 2302 } 2303 2304 SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) 2305 { 2306 if ((flag & ~AT_REMOVEDIR) != 0) 2307 return -EINVAL; 2308 2309 if (flag & AT_REMOVEDIR) 2310 return do_rmdir(dfd, pathname); 2311 2312 return do_unlinkat(dfd, pathname); 2313 } 2314 2315 SYSCALL_DEFINE1(unlink, const char __user *, pathname) 2316 { 2317 return do_unlinkat(AT_FDCWD, pathname); 2318 } 2319 2320 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) 2321 { 2322 int error = may_create(dir, dentry); 2323 2324 if (error) 2325 return error; 2326 2327 if (!dir->i_op->symlink) 2328 return -EPERM; 2329 2330 error = security_inode_symlink(dir, dentry, oldname); 2331 if (error) 2332 return error; 2333 2334 vfs_dq_init(dir); 2335 error = dir->i_op->symlink(dir, dentry, oldname); 2336 if (!error) 2337 fsnotify_create(dir, dentry); 2338 return error; 2339 } 2340 2341 SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, 2342 int, newdfd, const char __user *, newname) 2343 { 2344 int error; 2345 char *from; 2346 char *to; 2347 struct dentry *dentry; 2348 struct nameidata nd; 2349 2350 from = getname(oldname); 2351 if (IS_ERR(from)) 2352 return PTR_ERR(from); 2353 2354 error = user_path_parent(newdfd, newname, &nd, &to); 2355 if (error) 2356 goto out_putname; 2357 2358 dentry = lookup_create(&nd, 0); 2359 error = PTR_ERR(dentry); 2360 if (IS_ERR(dentry)) 2361 goto out_unlock; 2362 2363 error = mnt_want_write(nd.path.mnt); 2364 if (error) 2365 goto out_dput; 2366 error = security_path_symlink(&nd.path, dentry, from); 2367 if (error) 2368 goto out_drop_write; 2369 error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); 2370 out_drop_write: 2371 mnt_drop_write(nd.path.mnt); 2372 out_dput: 2373 dput(dentry); 2374 out_unlock: 2375 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2376 path_put(&nd.path); 2377 putname(to); 2378 out_putname: 2379 putname(from); 2380 return error; 2381 } 2382 2383 
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) 2384 { 2385 return sys_symlinkat(oldname, AT_FDCWD, newname); 2386 } 2387 2388 int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) 2389 { 2390 struct inode *inode = old_dentry->d_inode; 2391 int error; 2392 2393 if (!inode) 2394 return -ENOENT; 2395 2396 error = may_create(dir, new_dentry); 2397 if (error) 2398 return error; 2399 2400 if (dir->i_sb != inode->i_sb) 2401 return -EXDEV; 2402 2403 /* 2404 * A link to an append-only or immutable file cannot be created. 2405 */ 2406 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 2407 return -EPERM; 2408 if (!dir->i_op->link) 2409 return -EPERM; 2410 if (S_ISDIR(inode->i_mode)) 2411 return -EPERM; 2412 2413 error = security_inode_link(old_dentry, dir, new_dentry); 2414 if (error) 2415 return error; 2416 2417 mutex_lock(&inode->i_mutex); 2418 vfs_dq_init(dir); 2419 error = dir->i_op->link(old_dentry, dir, new_dentry); 2420 mutex_unlock(&inode->i_mutex); 2421 if (!error) 2422 fsnotify_link(dir, inode, new_dentry); 2423 return error; 2424 } 2425 2426 /* 2427 * Hardlinks are often used in delicate situations. We avoid 2428 * security-related surprises by not following symlinks on the 2429 * newname. --KAB 2430 * 2431 * We don't follow them on the oldname either to be compatible 2432 * with linux 2.0, and to avoid hard-linking to directories 2433 * and other special files. --ADM 2434 */ 2435 SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, 2436 int, newdfd, const char __user *, newname, int, flags) 2437 { 2438 struct dentry *new_dentry; 2439 struct nameidata nd; 2440 struct path old_path; 2441 int error; 2442 char *to; 2443 2444 if ((flags & ~AT_SYMLINK_FOLLOW) != 0) 2445 return -EINVAL; 2446 2447 error = user_path_at(olddfd, oldname, 2448 flags & AT_SYMLINK_FOLLOW ? 
LOOKUP_FOLLOW : 0, 2449 &old_path); 2450 if (error) 2451 return error; 2452 2453 error = user_path_parent(newdfd, newname, &nd, &to); 2454 if (error) 2455 goto out; 2456 error = -EXDEV; 2457 if (old_path.mnt != nd.path.mnt) 2458 goto out_release; 2459 new_dentry = lookup_create(&nd, 0); 2460 error = PTR_ERR(new_dentry); 2461 if (IS_ERR(new_dentry)) 2462 goto out_unlock; 2463 error = mnt_want_write(nd.path.mnt); 2464 if (error) 2465 goto out_dput; 2466 error = security_path_link(old_path.dentry, &nd.path, new_dentry); 2467 if (error) 2468 goto out_drop_write; 2469 error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); 2470 out_drop_write: 2471 mnt_drop_write(nd.path.mnt); 2472 out_dput: 2473 dput(new_dentry); 2474 out_unlock: 2475 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2476 out_release: 2477 path_put(&nd.path); 2478 putname(to); 2479 out: 2480 path_put(&old_path); 2481 2482 return error; 2483 } 2484 2485 SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) 2486 { 2487 return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); 2488 } 2489 2490 /* 2491 * The worst of all namespace operations - renaming directory. "Perverted" 2492 * doesn't even start to describe it. Somebody in UCB had a heck of a trip... 2493 * Problems: 2494 * a) we can get into loop creation. Check is done in is_subdir(). 2495 * b) race potential - two innocent renames can create a loop together. 2496 * That's where 4.4 screws up. Current fix: serialization on 2497 * sb->s_vfs_rename_mutex. We might be more accurate, but that's another 2498 * story. 2499 * c) we have to lock _three_ objects - parents and victim (if it exists). 2500 * And that - after we got ->i_mutex on parents (until then we don't know 2501 * whether the target exists). Solution: try to be smart with locking 2502 * order for inodes. 
We rely on the fact that tree topology may change 2503 * only under ->s_vfs_rename_mutex _and_ that parent of the object we 2504 * move will be locked. Thus we can rank directories by the tree 2505 * (ancestors first) and rank all non-directories after them. 2506 * That works since everybody except rename does "lock parent, lookup, 2507 * lock child" and rename is under ->s_vfs_rename_mutex. 2508 * HOWEVER, it relies on the assumption that any object with ->lookup() 2509 * has no more than 1 dentry. If "hybrid" objects will ever appear, 2510 * we'd better make sure that there's no link(2) for them. 2511 * d) some filesystems don't support opened-but-unlinked directories, 2512 * either because of layout or because they are not ready to deal with 2513 * all cases correctly. The latter will be fixed (taking this sort of 2514 * stuff into VFS), but the former is not going away. Solution: the same 2515 * trick as in rmdir(). 2516 * e) conversion from fhandle to dentry may come in the wrong moment - when 2517 * we are removing the target. Solution: we will have to grab ->i_mutex 2518 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on 2519 * ->i_mutex on parents, which works but leads to some truely excessive 2520 * locking]. 2521 */ 2522 static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, 2523 struct inode *new_dir, struct dentry *new_dentry) 2524 { 2525 int error = 0; 2526 struct inode *target; 2527 2528 /* 2529 * If we are going to change the parent - check write permissions, 2530 * we'll need to flip '..'. 
2531 */ 2532 if (new_dir != old_dir) { 2533 error = inode_permission(old_dentry->d_inode, MAY_WRITE); 2534 if (error) 2535 return error; 2536 } 2537 2538 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); 2539 if (error) 2540 return error; 2541 2542 target = new_dentry->d_inode; 2543 if (target) { 2544 mutex_lock(&target->i_mutex); 2545 dentry_unhash(new_dentry); 2546 } 2547 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 2548 error = -EBUSY; 2549 else 2550 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2551 if (target) { 2552 if (!error) 2553 target->i_flags |= S_DEAD; 2554 mutex_unlock(&target->i_mutex); 2555 if (d_unhashed(new_dentry)) 2556 d_rehash(new_dentry); 2557 dput(new_dentry); 2558 } 2559 if (!error) 2560 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 2561 d_move(old_dentry,new_dentry); 2562 return error; 2563 } 2564 2565 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, 2566 struct inode *new_dir, struct dentry *new_dentry) 2567 { 2568 struct inode *target; 2569 int error; 2570 2571 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); 2572 if (error) 2573 return error; 2574 2575 dget(new_dentry); 2576 target = new_dentry->d_inode; 2577 if (target) 2578 mutex_lock(&target->i_mutex); 2579 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 2580 error = -EBUSY; 2581 else 2582 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2583 if (!error) { 2584 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 2585 d_move(old_dentry, new_dentry); 2586 } 2587 if (target) 2588 mutex_unlock(&target->i_mutex); 2589 dput(new_dentry); 2590 return error; 2591 } 2592 2593 int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, 2594 struct inode *new_dir, struct dentry *new_dentry) 2595 { 2596 int error; 2597 int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); 2598 const char *old_name; 2599 2600 if 
(old_dentry->d_inode == new_dentry->d_inode) 2601 return 0; 2602 2603 error = may_delete(old_dir, old_dentry, is_dir); 2604 if (error) 2605 return error; 2606 2607 if (!new_dentry->d_inode) 2608 error = may_create(new_dir, new_dentry); 2609 else 2610 error = may_delete(new_dir, new_dentry, is_dir); 2611 if (error) 2612 return error; 2613 2614 if (!old_dir->i_op->rename) 2615 return -EPERM; 2616 2617 vfs_dq_init(old_dir); 2618 vfs_dq_init(new_dir); 2619 2620 old_name = fsnotify_oldname_init(old_dentry->d_name.name); 2621 2622 if (is_dir) 2623 error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); 2624 else 2625 error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); 2626 if (!error) { 2627 const char *new_name = old_dentry->d_name.name; 2628 fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir, 2629 new_dentry->d_inode, old_dentry); 2630 } 2631 fsnotify_oldname_free(old_name); 2632 2633 return error; 2634 } 2635 2636 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, 2637 int, newdfd, const char __user *, newname) 2638 { 2639 struct dentry *old_dir, *new_dir; 2640 struct dentry *old_dentry, *new_dentry; 2641 struct dentry *trap; 2642 struct nameidata oldnd, newnd; 2643 char *from; 2644 char *to; 2645 int error; 2646 2647 error = user_path_parent(olddfd, oldname, &oldnd, &from); 2648 if (error) 2649 goto exit; 2650 2651 error = user_path_parent(newdfd, newname, &newnd, &to); 2652 if (error) 2653 goto exit1; 2654 2655 error = -EXDEV; 2656 if (oldnd.path.mnt != newnd.path.mnt) 2657 goto exit2; 2658 2659 old_dir = oldnd.path.dentry; 2660 error = -EBUSY; 2661 if (oldnd.last_type != LAST_NORM) 2662 goto exit2; 2663 2664 new_dir = newnd.path.dentry; 2665 if (newnd.last_type != LAST_NORM) 2666 goto exit2; 2667 2668 oldnd.flags &= ~LOOKUP_PARENT; 2669 newnd.flags &= ~LOOKUP_PARENT; 2670 newnd.flags |= LOOKUP_RENAME_TARGET; 2671 2672 trap = lock_rename(new_dir, old_dir); 2673 2674 old_dentry = lookup_hash(&oldnd); 2675 error = 
PTR_ERR(old_dentry); 2676 if (IS_ERR(old_dentry)) 2677 goto exit3; 2678 /* source must exist */ 2679 error = -ENOENT; 2680 if (!old_dentry->d_inode) 2681 goto exit4; 2682 /* unless the source is a directory trailing slashes give -ENOTDIR */ 2683 if (!S_ISDIR(old_dentry->d_inode->i_mode)) { 2684 error = -ENOTDIR; 2685 if (oldnd.last.name[oldnd.last.len]) 2686 goto exit4; 2687 if (newnd.last.name[newnd.last.len]) 2688 goto exit4; 2689 } 2690 /* source should not be ancestor of target */ 2691 error = -EINVAL; 2692 if (old_dentry == trap) 2693 goto exit4; 2694 new_dentry = lookup_hash(&newnd); 2695 error = PTR_ERR(new_dentry); 2696 if (IS_ERR(new_dentry)) 2697 goto exit4; 2698 /* target should not be an ancestor of source */ 2699 error = -ENOTEMPTY; 2700 if (new_dentry == trap) 2701 goto exit5; 2702 2703 error = mnt_want_write(oldnd.path.mnt); 2704 if (error) 2705 goto exit5; 2706 error = security_path_rename(&oldnd.path, old_dentry, 2707 &newnd.path, new_dentry); 2708 if (error) 2709 goto exit6; 2710 error = vfs_rename(old_dir->d_inode, old_dentry, 2711 new_dir->d_inode, new_dentry); 2712 exit6: 2713 mnt_drop_write(oldnd.path.mnt); 2714 exit5: 2715 dput(new_dentry); 2716 exit4: 2717 dput(old_dentry); 2718 exit3: 2719 unlock_rename(new_dir, old_dir); 2720 exit2: 2721 path_put(&newnd.path); 2722 putname(to); 2723 exit1: 2724 path_put(&oldnd.path); 2725 putname(from); 2726 exit: 2727 return error; 2728 } 2729 2730 SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) 2731 { 2732 return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); 2733 } 2734 2735 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) 2736 { 2737 int len; 2738 2739 len = PTR_ERR(link); 2740 if (IS_ERR(link)) 2741 goto out; 2742 2743 len = strlen(link); 2744 if (len > (unsigned) buflen) 2745 len = buflen; 2746 if (copy_to_user(buffer, link, len)) 2747 len = -EFAULT; 2748 out: 2749 return len; 2750 } 2751 2752 /* 2753 * A helper for 
->readlink(). This should be used *ONLY* for symlinks that 2754 * have ->follow_link() touching nd only in nd_set_link(). Using (or not 2755 * using) it for any given inode is up to filesystem. 2756 */ 2757 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) 2758 { 2759 struct nameidata nd; 2760 void *cookie; 2761 int res; 2762 2763 nd.depth = 0; 2764 cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); 2765 if (IS_ERR(cookie)) 2766 return PTR_ERR(cookie); 2767 2768 res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); 2769 if (dentry->d_inode->i_op->put_link) 2770 dentry->d_inode->i_op->put_link(dentry, &nd, cookie); 2771 return res; 2772 } 2773 2774 int vfs_follow_link(struct nameidata *nd, const char *link) 2775 { 2776 return __vfs_follow_link(nd, link); 2777 } 2778 2779 /* get the link contents into pagecache */ 2780 static char *page_getlink(struct dentry * dentry, struct page **ppage) 2781 { 2782 char *kaddr; 2783 struct page *page; 2784 struct address_space *mapping = dentry->d_inode->i_mapping; 2785 page = read_mapping_page(mapping, 0, NULL); 2786 if (IS_ERR(page)) 2787 return (char*)page; 2788 *ppage = page; 2789 kaddr = kmap(page); 2790 nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1); 2791 return kaddr; 2792 } 2793 2794 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) 2795 { 2796 struct page *page = NULL; 2797 char *s = page_getlink(dentry, &page); 2798 int res = vfs_readlink(dentry,buffer,buflen,s); 2799 if (page) { 2800 kunmap(page); 2801 page_cache_release(page); 2802 } 2803 return res; 2804 } 2805 2806 void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) 2807 { 2808 struct page *page = NULL; 2809 nd_set_link(nd, page_getlink(dentry, &page)); 2810 return page; 2811 } 2812 2813 void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) 2814 { 2815 struct page *page = cookie; 2816 2817 if (page) { 2818 kunmap(page); 2819 
page_cache_release(page); 2820 } 2821 } 2822 2823 /* 2824 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS 2825 */ 2826 int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) 2827 { 2828 struct address_space *mapping = inode->i_mapping; 2829 struct page *page; 2830 void *fsdata; 2831 int err; 2832 char *kaddr; 2833 unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; 2834 if (nofs) 2835 flags |= AOP_FLAG_NOFS; 2836 2837 retry: 2838 err = pagecache_write_begin(NULL, mapping, 0, len-1, 2839 flags, &page, &fsdata); 2840 if (err) 2841 goto fail; 2842 2843 kaddr = kmap_atomic(page, KM_USER0); 2844 memcpy(kaddr, symname, len-1); 2845 kunmap_atomic(kaddr, KM_USER0); 2846 2847 err = pagecache_write_end(NULL, mapping, 0, len-1, len-1, 2848 page, fsdata); 2849 if (err < 0) 2850 goto fail; 2851 if (err < len-1) 2852 goto retry; 2853 2854 mark_inode_dirty(inode); 2855 return 0; 2856 fail: 2857 return err; 2858 } 2859 2860 int page_symlink(struct inode *inode, const char *symname, int len) 2861 { 2862 return __page_symlink(inode, symname, len, 2863 !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); 2864 } 2865 2866 const struct inode_operations page_symlink_inode_operations = { 2867 .readlink = generic_readlink, 2868 .follow_link = page_follow_link_light, 2869 .put_link = page_put_link, 2870 }; 2871 2872 EXPORT_SYMBOL(user_path_at); 2873 EXPORT_SYMBOL(follow_down); 2874 EXPORT_SYMBOL(follow_up); 2875 EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ 2876 EXPORT_SYMBOL(getname); 2877 EXPORT_SYMBOL(lock_rename); 2878 EXPORT_SYMBOL(lookup_one_len); 2879 EXPORT_SYMBOL(page_follow_link_light); 2880 EXPORT_SYMBOL(page_put_link); 2881 EXPORT_SYMBOL(page_readlink); 2882 EXPORT_SYMBOL(__page_symlink); 2883 EXPORT_SYMBOL(page_symlink); 2884 EXPORT_SYMBOL(page_symlink_inode_operations); 2885 EXPORT_SYMBOL(path_lookup); 2886 EXPORT_SYMBOL(kern_path); 2887 EXPORT_SYMBOL(vfs_path_lookup); 2888 EXPORT_SYMBOL(inode_permission); 2889 
/* Remaining symbols exported to modules. */
EXPORT_SYMBOL(file_permission);
EXPORT_SYMBOL(unlock_rename);
EXPORT_SYMBOL(vfs_create);
EXPORT_SYMBOL(vfs_follow_link);
EXPORT_SYMBOL(vfs_link);
EXPORT_SYMBOL(vfs_mkdir);
EXPORT_SYMBOL(vfs_mknod);
EXPORT_SYMBOL(generic_permission);
EXPORT_SYMBOL(vfs_readlink);
EXPORT_SYMBOL(vfs_rename);
EXPORT_SYMBOL(vfs_rmdir);
EXPORT_SYMBOL(vfs_symlink);
EXPORT_SYMBOL(vfs_unlink);
EXPORT_SYMBOL(dentry_unhash);
EXPORT_SYMBOL(generic_readlink);