/*
 * linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 * Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/quotaops.h>
#include <linux/acct.h>
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/seq_file.h>
#include <linux/mnt_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/log2.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include "pnode.h"
#include "internal.h"

#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
#define HASH_SIZE (1UL << HASH_SHIFT)

/* spinlock for vfsmount related operations, in place of dcache_lock */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);

static int event;

static struct list_head *mount_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static struct rw_semaphore namespace_sem;

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> HASH_SHIFT);
	return tmp & (HASH_SIZE - 1);
}

struct vfsmount *alloc_vfsmnt(const char *name)
{
	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		atomic_set(&mnt->mnt_count, 1);
		INIT_LIST_HEAD(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		if (name) {
			int size = strlen(name) + 1;
			char *newname = kmalloc(size, GFP_KERNEL);
			if (newname) {
				memcpy(newname, name, size);
				mnt->mnt_devname = newname;
			}
		}
	}
	return mnt;
}

int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
	mnt->mnt_sb = sb;
	mnt->mnt_root = dget(sb->s_root);
	return 0;
}

EXPORT_SYMBOL(simple_set_mnt);

void free_vfsmnt(struct vfsmount *mnt)
{
	kfree(mnt->mnt_devname);
	kmem_cache_free(mnt_cache, mnt);
}

/*
 * find the first or last mount at @dentry on vfsmount @mnt depending on
 * @dir. If @dir is set return the first mount else return the last mount.
 */
struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
			      int dir)
{
	struct list_head *head = mount_hashtable + hash(mnt, dentry);
	struct list_head *tmp = head;
	struct vfsmount *p, *found = NULL;

	for (;;) {
		tmp = dir ? tmp->next : tmp->prev;
		p = NULL;
		if (tmp == head)
			break;
		p = list_entry(tmp, struct vfsmount, mnt_hash);
		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
			found = p;
			break;
		}
	}
	return found;
}
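/*
 * For illustration (a sketch, not a spec): path lookup resolves a chain
 * of stacked mounts by calling lookup_mnt() repeatedly, roughly the way
 * fs/namei.c follows a mount point:
 *
 *	while (d_mountpoint(dentry)) {
 *		struct vfsmount *mounted = lookup_mnt(mnt, dentry);
 *		if (!mounted)
 *			break;
 *		mnt = mounted;
 *		dentry = mounted->mnt_root;
 *	}
 *
 * Mount propagation can hash several mounts under the same (parent,
 * dentry) pair; @dir picks which end of that chain __lookup_mnt()
 * returns.
 */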
/*
 * lookup_mnt increments the ref count before returning
 * the vfsmount struct.
 */
struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
	struct vfsmount *child_mnt;
	spin_lock(&vfsmount_lock);
	if ((child_mnt = __lookup_mnt(mnt, dentry, 1)))
		mntget(child_mnt);
	spin_unlock(&vfsmount_lock);
	return child_mnt;
}

static inline int check_mnt(struct vfsmount *mnt)
{
	return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}

static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}

static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
{
	old_path->dentry = mnt->mnt_mountpoint;
	old_path->mnt = mnt->mnt_parent;
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt_root;
	list_del_init(&mnt->mnt_child);
	list_del_init(&mnt->mnt_hash);
	old_path->dentry->d_mounted--;
}

void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
			struct vfsmount *child_mnt)
{
	child_mnt->mnt_parent = mntget(mnt);
	child_mnt->mnt_mountpoint = dget(dentry);
	dentry->d_mounted++;
}

static void attach_mnt(struct vfsmount *mnt, struct path *path)
{
	mnt_set_mountpoint(path->mnt, path->dentry, mnt);
	list_add_tail(&mnt->mnt_hash, mount_hashtable +
			hash(path->mnt, path->dentry));
	list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
}

/*
 * the caller must hold vfsmount_lock
 */
static void commit_tree(struct vfsmount *mnt)
{
	struct vfsmount *parent = mnt->mnt_parent;
	struct vfsmount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;
	list_splice(&head, n->list.prev);

	list_add_tail(&mnt->mnt_hash, mount_hashtable +
			hash(parent, mnt->mnt_mountpoint));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
	touch_mnt_namespace(n);
}

static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct vfsmount, mnt_child);
}

static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
{
	struct list_head *prev = p->mnt_mounts.prev;
	while (prev != &p->mnt_mounts) {
		p = list_entry(prev, struct vfsmount, mnt_child);
		prev = p->mnt_mounts.prev;
	}
	return p;
}
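/*
 * Descriptive note (added for clarity): next_mnt() steps depth-first
 * through the tree of mounts rooted at @root, and skip_mnt_tree() jumps
 * to the last mount of a subtree so a walk can leap over it.  The
 * canonical walk, used throughout this file, is:
 *
 *	struct vfsmount *p;
 *	for (p = mnt; p; p = next_mnt(p, mnt))
 *		...;		-- visits mnt, then every mount below it
 */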
static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
					int flag)
{
	struct super_block *sb = old->mnt_sb;
	struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);

	if (mnt) {
		mnt->mnt_flags = old->mnt_flags;
		atomic_inc(&sb->s_active);
		mnt->mnt_sb = sb;
		mnt->mnt_root = dget(root);
		mnt->mnt_mountpoint = mnt->mnt_root;
		mnt->mnt_parent = mnt;

		if (flag & CL_SLAVE) {
			list_add(&mnt->mnt_slave, &old->mnt_slave_list);
			mnt->mnt_master = old;
			CLEAR_MNT_SHARED(mnt);
		} else if (!(flag & CL_PRIVATE)) {
			if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
				list_add(&mnt->mnt_share, &old->mnt_share);
			if (IS_MNT_SLAVE(old))
				list_add(&mnt->mnt_slave, &old->mnt_slave);
			mnt->mnt_master = old->mnt_master;
		}
		if (flag & CL_MAKE_SHARED)
			set_mnt_shared(mnt);

		/* stick the duplicate mount on the same expiry list
		 * as the original if that was on one */
		if (flag & CL_EXPIRE) {
			if (!list_empty(&old->mnt_expire))
				list_add(&mnt->mnt_expire, &old->mnt_expire);
		}
	}
	return mnt;
}

static inline void __mntput(struct vfsmount *mnt)
{
	struct super_block *sb = mnt->mnt_sb;
	dput(mnt->mnt_root);
	free_vfsmnt(mnt);
	deactivate_super(sb);
}

void mntput_no_expire(struct vfsmount *mnt)
{
repeat:
	if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
		if (likely(!mnt->mnt_pinned)) {
			spin_unlock(&vfsmount_lock);
			__mntput(mnt);
			return;
		}
		atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
		mnt->mnt_pinned = 0;
		spin_unlock(&vfsmount_lock);
		acct_auto_close_mnt(mnt);
		security_sb_umount_close(mnt);
		goto repeat;
	}
}

EXPORT_SYMBOL(mntput_no_expire);

void mnt_pin(struct vfsmount *mnt)
{
	spin_lock(&vfsmount_lock);
	mnt->mnt_pinned++;
	spin_unlock(&vfsmount_lock);
}

EXPORT_SYMBOL(mnt_pin);

void mnt_unpin(struct vfsmount *mnt)
{
	spin_lock(&vfsmount_lock);
	if (mnt->mnt_pinned) {
		atomic_inc(&mnt->mnt_count);
		mnt->mnt_pinned--;
	}
	spin_unlock(&vfsmount_lock);
}

EXPORT_SYMBOL(mnt_unpin);

static inline void mangle(struct seq_file *m, const char *s)
{
	seq_escape(m, s, " \t\n\\");
}

/*
 * Simple .show_options callback for filesystems which don't want to
 * implement more complex mount option showing.
 *
 * See also save_mount_options().
 */
int generic_show_options(struct seq_file *m, struct vfsmount *mnt)
{
	const char *options = mnt->mnt_sb->s_options;

	if (options != NULL && options[0]) {
		seq_putc(m, ',');
		mangle(m, options);
	}

	return 0;
}
EXPORT_SYMBOL(generic_show_options);
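/*
 * For illustration only (a sketch; the examplefs names are
 * hypothetical): a filesystem opts in by saving its option string at
 * mount time and pointing its super_operations at the generic helper:
 *
 *	static const struct super_operations examplefs_ops = {
 *		...
 *		.show_options	= generic_show_options,
 *	};
 *
 *	static int examplefs_fill_super(struct super_block *sb,
 *					void *data, int silent)
 *	{
 *		save_mount_options(sb, data);
 *		...
 *	}
 */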
/*
 * If a filesystem uses generic_show_options(), this function should be
 * called from the fill_super() callback.
 *
 * The .remount_fs callback usually needs to be handled in a special
 * way, to make sure that previous options are not overwritten if the
 * remount fails.
 *
 * Also note that if the filesystem's .remount_fs function doesn't
 * reset all options to their default value, but changes only newly
 * given options, then the displayed options will not reflect reality
 * any more.
 */
void save_mount_options(struct super_block *sb, char *options)
{
	kfree(sb->s_options);
	sb->s_options = kstrdup(options, GFP_KERNEL);
}
EXPORT_SYMBOL(save_mount_options);

/* iterator */
static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct mnt_namespace *n = m->private;

	down_read(&namespace_sem);
	return seq_list_start(&n->list, *pos);
}

static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct mnt_namespace *n = m->private;

	return seq_list_next(v, &n->list, pos);
}

static void m_stop(struct seq_file *m, void *v)
{
	up_read(&namespace_sem);
}

static int show_vfsmnt(struct seq_file *m, void *v)
{
	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
	int err = 0;
	static struct proc_fs_info {
		int flag;
		char *str;
	} fs_info[] = {
		{ MS_SYNCHRONOUS, ",sync" },
		{ MS_DIRSYNC, ",dirsync" },
		{ MS_MANDLOCK, ",mand" },
		{ 0, NULL }
	};
	static struct proc_fs_info mnt_info[] = {
		{ MNT_NOSUID, ",nosuid" },
		{ MNT_NODEV, ",nodev" },
		{ MNT_NOEXEC, ",noexec" },
		{ MNT_NOATIME, ",noatime" },
		{ MNT_NODIRATIME, ",nodiratime" },
		{ MNT_RELATIME, ",relatime" },
		{ 0, NULL }
	};
	struct proc_fs_info *fs_infop;
	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };

	mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
	seq_putc(m, ' ');
	seq_path(m, &mnt_path, " \t\n\\");
	seq_putc(m, ' ');
	mangle(m, mnt->mnt_sb->s_type->name);
	if (mnt->mnt_sb->s_subtype && mnt->mnt_sb->s_subtype[0]) {
		seq_putc(m, '.');
		mangle(m, mnt->mnt_sb->s_subtype);
	}
	seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
	for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
		if (mnt->mnt_sb->s_flags & fs_infop->flag)
			seq_puts(m, fs_infop->str);
	}
	for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
		if (mnt->mnt_flags & fs_infop->flag)
			seq_puts(m, fs_infop->str);
	}
	if (mnt->mnt_sb->s_op->show_options)
		err = mnt->mnt_sb->s_op->show_options(m, mnt);
	seq_puts(m, " 0 0\n");
	return err;
}

struct seq_operations mounts_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_vfsmnt
};

static int show_vfsstat(struct seq_file *m, void *v)
{
	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
	int err = 0;

	/* device */
	if (mnt->mnt_devname) {
		seq_puts(m, "device ");
		mangle(m, mnt->mnt_devname);
	} else
		seq_puts(m, "no device");

	/* mount point */
	seq_puts(m, " mounted on ");
	seq_path(m, &mnt_path, " \t\n\\");
	seq_putc(m, ' ');

	/* file system type */
	seq_puts(m, "with fstype ");
	mangle(m, mnt->mnt_sb->s_type->name);

	/* optional statistics */
	if (mnt->mnt_sb->s_op->show_stats) {
		seq_putc(m, ' ');
		err = mnt->mnt_sb->s_op->show_stats(m, mnt);
	}

	seq_putc(m, '\n');
	return err;
}

struct seq_operations mountstats_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_vfsstat,
};
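/*
 * Added note: mounts_op backs /proc/mounts and mountstats_op backs
 * /proc/<pid>/mountstats.  For illustration, show_vfsmnt() emits one
 * record per mount in the classic /etc/mtab format, e.g.
 *
 *	/dev/sda1 /boot ext3 rw,nosuid 0 0
 *
 * while show_vfsstat() emits the more verbose
 *
 *	device /dev/sda1 mounted on /boot with fstype ext3
 */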
/**
 * may_umount_tree - check if a mount tree is busy
 * @mnt: root of mount tree
 *
 * This is called to check if a tree of mounts has any
 * open files, pwds, chroots or sub mounts that are
 * busy.
 */
int may_umount_tree(struct vfsmount *mnt)
{
	int actual_refs = 0;
	int minimum_refs = 0;
	struct vfsmount *p;

	spin_lock(&vfsmount_lock);
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		actual_refs += atomic_read(&p->mnt_count);
		minimum_refs += 2;
	}
	spin_unlock(&vfsmount_lock);

	if (actual_refs > minimum_refs)
		return 0;

	return 1;
}

EXPORT_SYMBOL(may_umount_tree);

/**
 * may_umount - check if a mount point is busy
 * @mnt: root of mount
 *
 * This is called to check if a mount point has any
 * open files, pwds, chroots or sub mounts. If the
 * mount has sub mounts this will return busy
 * regardless of whether the sub mounts are busy.
 *
 * Doesn't take quota and stuff into account. IOW, in some cases it will
 * give false negatives. The main reason why it's here is that we need
 * a non-destructive way to look for easily umountable filesystems.
 */
int may_umount(struct vfsmount *mnt)
{
	int ret = 1;
	spin_lock(&vfsmount_lock);
	if (propagate_mount_busy(mnt, 2))
		ret = 0;
	spin_unlock(&vfsmount_lock);
	return ret;
}

EXPORT_SYMBOL(may_umount);

void release_mounts(struct list_head *head)
{
	struct vfsmount *mnt;
	while (!list_empty(head)) {
		mnt = list_first_entry(head, struct vfsmount, mnt_hash);
		list_del_init(&mnt->mnt_hash);
		if (mnt->mnt_parent != mnt) {
			struct dentry *dentry;
			struct vfsmount *m;
			spin_lock(&vfsmount_lock);
			dentry = mnt->mnt_mountpoint;
			m = mnt->mnt_parent;
			mnt->mnt_mountpoint = mnt->mnt_root;
			mnt->mnt_parent = mnt;
			m->mnt_ghosts--;
			spin_unlock(&vfsmount_lock);
			dput(dentry);
			mntput(m);
		}
		mntput(mnt);
	}
}

void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
{
	struct vfsmount *p;

	for (p = mnt; p; p = next_mnt(p, mnt))
		list_move(&p->mnt_hash, kill);

	if (propagate)
		propagate_umount(kill);

	list_for_each_entry(p, kill, mnt_hash) {
		list_del_init(&p->mnt_expire);
		list_del_init(&p->mnt_list);
		__touch_mnt_namespace(p->mnt_ns);
		p->mnt_ns = NULL;
		list_del_init(&p->mnt_child);
		if (p->mnt_parent != p) {
			p->mnt_parent->mnt_ghosts++;
			p->mnt_mountpoint->d_mounted--;
		}
		change_mnt_propagation(p, MS_PRIVATE);
	}
}
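/*
 * Added note: unmounting is a two-phase affair.  umount_tree() runs
 * under vfsmount_lock and only detaches mounts onto the caller's kill
 * list; release_mounts() then drops the dentry and vfsmount references
 * once the lock has been released, since dput()/mntput() may sleep.
 * The pattern used by the callers below is:
 *
 *	LIST_HEAD(umount_list);
 *	spin_lock(&vfsmount_lock);
 *	umount_tree(mnt, 0, &umount_list);
 *	spin_unlock(&vfsmount_lock);
 *	release_mounts(&umount_list);
 */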
static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);

static int do_umount(struct vfsmount *mnt, int flags)
{
	struct super_block *sb = mnt->mnt_sb;
	int retval;
	LIST_HEAD(umount_list);

	retval = security_sb_umount(mnt, flags);
	if (retval)
		return retval;

	/*
	 * Allow userspace to request a mountpoint be expired rather than
	 * unmounting unconditionally. Unmount only happens if:
	 *  (1) the mark is already set (the mark is cleared by mntput())
	 *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
	 */
	if (flags & MNT_EXPIRE) {
		if (mnt == current->fs->root.mnt ||
		    flags & (MNT_FORCE | MNT_DETACH))
			return -EINVAL;

		if (atomic_read(&mnt->mnt_count) != 2)
			return -EBUSY;

		if (!xchg(&mnt->mnt_expiry_mark, 1))
			return -EAGAIN;
	}

	/*
	 * If we may have to abort operations to get out of this
	 * mount, and they will themselves hold resources we must
	 * allow the fs to do things. In the Unix tradition of
	 * 'Gee that's tricky lets do it in userspace' the umount_begin
	 * might fail to complete on the first run through as other tasks
	 * must return, and the like. That's for the mount program to worry
	 * about for the moment.
	 */

	lock_kernel();
	if (sb->s_op->umount_begin)
		sb->s_op->umount_begin(mnt, flags);
	unlock_kernel();

	/*
	 * No sense to grab the lock for this test, but test itself looks
	 * somewhat bogus. Suggestions for better replacement?
	 * Ho-hum... In principle, we might treat that as umount + switch
	 * to rootfs. GC would eventually take care of the old vfsmount.
	 * Actually it makes sense, especially if rootfs would contain a
	 * /reboot - static binary that would close all descriptors and
	 * call reboot(2). Then init(8) could umount root and exec /reboot.
	 */
	if (mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
		/*
		 * Special case for "unmounting" root ...
		 * we just try to remount it readonly.
		 */
		down_write(&sb->s_umount);
		if (!(sb->s_flags & MS_RDONLY)) {
			lock_kernel();
			DQUOT_OFF(sb);
			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
			unlock_kernel();
		}
		up_write(&sb->s_umount);
		return retval;
	}

	down_write(&namespace_sem);
	spin_lock(&vfsmount_lock);
	event++;

	if (!(flags & MNT_DETACH))
		shrink_submounts(mnt, &umount_list);

	retval = -EBUSY;
	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
		if (!list_empty(&mnt->mnt_list))
			umount_tree(mnt, 1, &umount_list);
		retval = 0;
	}
	spin_unlock(&vfsmount_lock);
	if (retval)
		security_sb_umount_busy(mnt);
	up_write(&namespace_sem);
	release_mounts(&umount_list);
	return retval;
}

/*
 * Now umount can handle mount points as well as block devices.
 * This is important for filesystems which use unnamed block devices.
 *
 * We now support a flag for forced unmount like the other 'big iron'
 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
 */

asmlinkage long sys_umount(char __user * name, int flags)
{
	struct nameidata nd;
	int retval;

	retval = __user_walk(name, LOOKUP_FOLLOW, &nd);
	if (retval)
		goto out;
	retval = -EINVAL;
	if (nd.path.dentry != nd.path.mnt->mnt_root)
		goto dput_and_out;
	if (!check_mnt(nd.path.mnt))
		goto dput_and_out;

	retval = -EPERM;
	if (!capable(CAP_SYS_ADMIN))
		goto dput_and_out;

	retval = do_umount(nd.path.mnt, flags);
dput_and_out:
	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
	dput(nd.path.dentry);
	mntput_no_expire(nd.path.mnt);
out:
	return retval;
}
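/*
 * For illustration: userspace reaches these flags through umount2(2),
 * e.g.
 *
 *	umount2("/mnt/cdrom", MNT_DETACH);	-- lazy unmount, "umount -l"
 *	umount2("/mnt/cdrom", MNT_FORCE);	-- forced unmount, "umount -f"
 *	umount2("/mnt/cdrom", MNT_EXPIRE);	-- mark; unmounts on 2nd call
 *
 * MNT_EXPIRE returns -EAGAIN on the first call (setting the mark) and
 * only unmounts if the mount has stayed unused since.
 */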
#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 * The 2.0 compatible umount. No flags.
 */
asmlinkage long sys_oldumount(char __user * name)
{
	return sys_umount(name, 0);
}

#endif

static int mount_is_safe(struct nameidata *nd)
{
	if (capable(CAP_SYS_ADMIN))
		return 0;
	return -EPERM;
#ifdef notyet
	if (S_ISLNK(nd->path.dentry->d_inode->i_mode))
		return -EPERM;
	if (nd->path.dentry->d_inode->i_mode & S_ISVTX) {
		if (current->uid != nd->path.dentry->d_inode->i_uid)
			return -EPERM;
	}
	if (vfs_permission(nd, MAY_WRITE))
		return -EPERM;
	return 0;
#endif
}

static int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry)
{
	while (1) {
		if (d == dentry)
			return 1;
		if (d == NULL || d == d->d_parent)
			return 0;
		d = d->d_parent;
	}
}

struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
					int flag)
{
	struct vfsmount *res, *p, *q, *r, *s;
	struct path path;

	if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
		return NULL;

	res = q = clone_mnt(mnt, dentry, flag);
	if (!q)
		goto Enomem;
	q->mnt_mountpoint = mnt->mnt_mountpoint;

	p = mnt;
	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
		if (!lives_below_in_same_fs(r->mnt_mountpoint, dentry))
			continue;

		for (s = r; s; s = next_mnt(s, r)) {
			if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) {
				s = skip_mnt_tree(s);
				continue;
			}
			while (p != s->mnt_parent) {
				p = p->mnt_parent;
				q = q->mnt_parent;
			}
			p = s;
			path.mnt = q;
			path.dentry = p->mnt_mountpoint;
			q = clone_mnt(p, p->mnt_root, flag);
			if (!q)
				goto Enomem;
			spin_lock(&vfsmount_lock);
			list_add_tail(&q->mnt_list, &res->mnt_list);
			attach_mnt(q, &path);
			spin_unlock(&vfsmount_lock);
		}
	}
	return res;
Enomem:
	if (res) {
		LIST_HEAD(umount_list);
		spin_lock(&vfsmount_lock);
		umount_tree(res, 0, &umount_list);
		spin_unlock(&vfsmount_lock);
		release_mounts(&umount_list);
	}
	return NULL;
}

struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry)
{
	struct vfsmount *tree;
	down_read(&namespace_sem);
	tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
	up_read(&namespace_sem);
	return tree;
}

void drop_collected_mounts(struct vfsmount *mnt)
{
	LIST_HEAD(umount_list);
	down_read(&namespace_sem);
	spin_lock(&vfsmount_lock);
	umount_tree(mnt, 0, &umount_list);
	spin_unlock(&vfsmount_lock);
	up_read(&namespace_sem);
	release_mounts(&umount_list);
}
/*
 * @source_mnt : mount tree to be attached
 * @nd         : place the mount tree @source_mnt is attached
 * @parent_nd  : if non-null, detach the source_mnt from its parent and
 *               store the parent mount and mountpoint dentry.
 *               (done when source_mnt is moved)
 *
 * NOTE: the table below explains the semantics when a source mount
 * of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared       |       private  |       slave    | unbindable |
 * | dest     |              |                |                |            |
 * |   |      |              |                |                |            |
 * |   v      |              |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)  |     shared (+) |     shared(+++)|  invalid   |
 * |          |              |                |                |            |
 * |non-shared| shared (+)   |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 *       tree of the destination mount and the cloned mount is added to
 *       the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 *       source mount.
 *
 * ---------------------------------------------------------------------------
 * |                    MOVE MOUNT OPERATION                                 |
 * |**************************************************************************
 * | source-->| shared       |       private  |       slave    | unbindable |
 * | dest     |              |                |                |            |
 * |   |      |              |                |                |            |
 * |   v      |              |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |              |                |                |            |
 * |non-shared| shared (+*)  |      private   |      slave (*) | unbindable |
 * ***************************************************************************
 *
 * (+)   the mount is moved to the destination. And is then propagated to
 *       all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++) the mount is moved to the destination and is then propagated to
 *       all the mounts belonging to the destination mount's propagation tree.
 *       the mount is marked as 'shared and slave'.
 * (*)   the mount continues to be a slave at the new location.
 *
 * if the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 */
static int attach_recursive_mnt(struct vfsmount *source_mnt,
			struct path *path, struct path *parent_path)
{
	LIST_HEAD(tree_list);
	struct vfsmount *dest_mnt = path->mnt;
	struct dentry *dest_dentry = path->dentry;
	struct vfsmount *child, *p;

	if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list))
		return -EINVAL;

	if (IS_MNT_SHARED(dest_mnt)) {
		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
			set_mnt_shared(p);
	}

	spin_lock(&vfsmount_lock);
	if (parent_path) {
		detach_mnt(source_mnt, parent_path);
		attach_mnt(source_mnt, path);
		touch_mnt_namespace(current->nsproxy->mnt_ns);
	} else {
		mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
		commit_tree(source_mnt);
	}

	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
		list_del_init(&child->mnt_hash);
		commit_tree(child);
	}
	spin_unlock(&vfsmount_lock);
	return 0;
}

static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
{
	int err;
	if (mnt->mnt_sb->s_flags & MS_NOUSER)
		return -EINVAL;

	if (S_ISDIR(nd->path.dentry->d_inode->i_mode) !=
	      S_ISDIR(mnt->mnt_root->d_inode->i_mode))
		return -ENOTDIR;

	err = -ENOENT;
	mutex_lock(&nd->path.dentry->d_inode->i_mutex);
	if (IS_DEADDIR(nd->path.dentry->d_inode))
		goto out_unlock;

	err = security_sb_check_sb(mnt, nd);
	if (err)
		goto out_unlock;

	err = -ENOENT;
	if (IS_ROOT(nd->path.dentry) || !d_unhashed(nd->path.dentry))
		err = attach_recursive_mnt(mnt, &nd->path, NULL);
out_unlock:
	mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
	if (!err)
		security_sb_post_addmount(mnt, nd);
	return err;
}

/*
 * recursively change the type of the mountpoint.
 * noinline this do_mount helper to save do_mount stack space.
 */
static noinline int do_change_type(struct nameidata *nd, int flag)
{
	struct vfsmount *m, *mnt = nd->path.mnt;
	int recurse = flag & MS_REC;
	int type = flag & ~MS_REC;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (nd->path.dentry != nd->path.mnt->mnt_root)
		return -EINVAL;

	down_write(&namespace_sem);
	spin_lock(&vfsmount_lock);
	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
		change_mnt_propagation(m, type);
	spin_unlock(&vfsmount_lock);
	up_write(&namespace_sem);
	return 0;
}
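/*
 * For illustration: the propagation types handled above correspond to
 * the mount(8) commands (see Documentation/filesystems/sharedsubtree.txt):
 *
 *	mount --make-shared /mnt	-> MS_SHARED
 *	mount --make-slave /mnt		-> MS_SLAVE
 *	mount --make-private /mnt	-> MS_PRIVATE
 *	mount --make-unbindable /mnt	-> MS_UNBINDABLE
 *
 * The --make-rshared etc. variants add MS_REC, which makes the loop in
 * do_change_type() walk the whole tree instead of a single mount.
 */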
/*
 * do loopback mount.
 * noinline this do_mount helper to save do_mount stack space.
 */
static noinline int do_loopback(struct nameidata *nd, char *old_name,
				int recurse)
{
	struct nameidata old_nd;
	struct vfsmount *mnt = NULL;
	int err = mount_is_safe(nd);
	if (err)
		return err;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
	if (err)
		return err;

	down_write(&namespace_sem);
	err = -EINVAL;
	if (IS_MNT_UNBINDABLE(old_nd.path.mnt))
		goto out;

	if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt))
		goto out;

	err = -ENOMEM;
	if (recurse)
		mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, 0);
	else
		mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, 0);

	if (!mnt)
		goto out;

	err = graft_tree(mnt, nd);
	if (err) {
		LIST_HEAD(umount_list);
		spin_lock(&vfsmount_lock);
		umount_tree(mnt, 0, &umount_list);
		spin_unlock(&vfsmount_lock);
		release_mounts(&umount_list);
	}

out:
	up_write(&namespace_sem);
	path_put(&old_nd.path);
	return err;
}
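/*
 * For illustration: this is the path taken by bind mounts, e.g.
 *
 *	mount --bind  /usr/src /mnt	-> MS_BIND
 *	mount --rbind /usr/src /mnt	-> MS_BIND | MS_REC
 *
 * which from userspace is mount(2) with the fstype ignored:
 *
 *	mount("/usr/src", "/mnt", NULL, MS_BIND, NULL);
 */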
/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 * noinline this do_mount helper to save do_mount stack space.
 */
static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
			       void *data)
{
	int err;
	struct super_block *sb = nd->path.mnt->mnt_sb;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!check_mnt(nd->path.mnt))
		return -EINVAL;

	if (nd->path.dentry != nd->path.mnt->mnt_root)
		return -EINVAL;

	down_write(&sb->s_umount);
	err = do_remount_sb(sb, flags, data, 0);
	if (!err)
		nd->path.mnt->mnt_flags = mnt_flags;
	up_write(&sb->s_umount);
	if (!err)
		security_sb_post_remount(nd->path.mnt, flags, data);
	return err;
}

static inline int tree_contains_unbindable(struct vfsmount *mnt)
{
	struct vfsmount *p;
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		if (IS_MNT_UNBINDABLE(p))
			return 1;
	}
	return 0;
}

/*
 * noinline this do_mount helper to save do_mount stack space.
 */
static noinline int do_move_mount(struct nameidata *nd, char *old_name)
{
	struct nameidata old_nd;
	struct path parent_path;
	struct vfsmount *p;
	int err = 0;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
	if (err)
		return err;

	down_write(&namespace_sem);
	while (d_mountpoint(nd->path.dentry) &&
	       follow_down(&nd->path.mnt, &nd->path.dentry))
		;
	err = -EINVAL;
	if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt))
		goto out;

	err = -ENOENT;
	mutex_lock(&nd->path.dentry->d_inode->i_mutex);
	if (IS_DEADDIR(nd->path.dentry->d_inode))
		goto out1;

	if (!IS_ROOT(nd->path.dentry) && d_unhashed(nd->path.dentry))
		goto out1;

	err = -EINVAL;
	if (old_nd.path.dentry != old_nd.path.mnt->mnt_root)
		goto out1;

	if (old_nd.path.mnt == old_nd.path.mnt->mnt_parent)
		goto out1;

	if (S_ISDIR(nd->path.dentry->d_inode->i_mode) !=
	      S_ISDIR(old_nd.path.dentry->d_inode->i_mode))
		goto out1;
	/*
	 * Don't move a mount residing in a shared parent.
	 */
	if (old_nd.path.mnt->mnt_parent &&
	    IS_MNT_SHARED(old_nd.path.mnt->mnt_parent))
		goto out1;
	/*
	 * Don't move a mount tree containing unbindable mounts to a destination
	 * mount which is shared.
	 */
	if (IS_MNT_SHARED(nd->path.mnt) &&
	    tree_contains_unbindable(old_nd.path.mnt))
		goto out1;
	err = -ELOOP;
	for (p = nd->path.mnt; p->mnt_parent != p; p = p->mnt_parent)
		if (p == old_nd.path.mnt)
			goto out1;

	err = attach_recursive_mnt(old_nd.path.mnt, &nd->path, &parent_path);
	if (err)
		goto out1;

	/* if the mount is moved, it should no longer expire
	 * automatically */
	list_del_init(&old_nd.path.mnt->mnt_expire);
out1:
	mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
out:
	up_write(&namespace_sem);
	if (!err)
		path_put(&parent_path);
	path_put(&old_nd.path);
	return err;
}

/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 * noinline this do_mount helper to save do_mount stack space.
 */
static noinline int do_new_mount(struct nameidata *nd, char *type, int flags,
			int mnt_flags, char *name, void *data)
{
	struct vfsmount *mnt;

	if (!type || !memchr(type, 0, PAGE_SIZE))
		return -EINVAL;

	/* we need capabilities... */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	mnt = do_kern_mount(type, flags, name, data);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	return do_add_mount(mnt, nd, mnt_flags, NULL);
}

/*
 * add a mount into a namespace's mount tree
 * - provide the option of adding the new mount to an expiration list
 */
int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
		 int mnt_flags, struct list_head *fslist)
{
	int err;

	down_write(&namespace_sem);
	/* Something was mounted here while we slept */
	while (d_mountpoint(nd->path.dentry) &&
	       follow_down(&nd->path.mnt, &nd->path.dentry))
		;
	err = -EINVAL;
	if (!check_mnt(nd->path.mnt))
		goto unlock;

	/* Refuse the same filesystem on the same mount point */
	err = -EBUSY;
	if (nd->path.mnt->mnt_sb == newmnt->mnt_sb &&
	    nd->path.mnt->mnt_root == nd->path.dentry)
		goto unlock;

	err = -EINVAL;
	if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
		goto unlock;

	newmnt->mnt_flags = mnt_flags;
	if ((err = graft_tree(newmnt, nd)))
		goto unlock;

	if (fslist)	/* add to the specified expiration list */
		list_add_tail(&newmnt->mnt_expire, fslist);

	up_write(&namespace_sem);
	return 0;

unlock:
	up_write(&namespace_sem);
	mntput(newmnt);
	return err;
}

EXPORT_SYMBOL_GPL(do_add_mount);
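/*
 * For illustration only (a sketch; the list name is hypothetical):
 * filesystems that create submounts on the fly, such as NFS or AFS,
 * pass an expiration list so unused submounts can be reaped later:
 *
 *	err = do_add_mount(newmnt, nd, MNT_SHRINKABLE, &fs_automount_list);
 *
 * A periodic job then calls mark_mounts_for_expiry(&fs_automount_list);
 * a mount is unmounted on the second pass if nothing used it in between.
 */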
/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
	struct vfsmount *mnt, *next;
	LIST_HEAD(graveyard);
	LIST_HEAD(umounts);

	if (list_empty(mounts))
		return;

	down_write(&namespace_sem);
	spin_lock(&vfsmount_lock);

	/* extract from the expiration list every vfsmount that matches the
	 * following criteria:
	 * - only referenced by its parent vfsmount
	 * - still marked for expiry (marked on the last call here; marks are
	 *   cleared by mntput())
	 */
	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
			propagate_mount_busy(mnt, 1))
			continue;
		list_move(&mnt->mnt_expire, &graveyard);
	}
	while (!list_empty(&graveyard)) {
		mnt = list_first_entry(&graveyard, struct vfsmount, mnt_expire);
		touch_mnt_namespace(mnt->mnt_ns);
		umount_tree(mnt, 1, &umounts);
	}
	spin_unlock(&vfsmount_lock);
	up_write(&namespace_sem);

	release_mounts(&umounts);
}

EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);

/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 */
static int select_submounts(struct vfsmount *parent, struct list_head *graveyard)
{
	struct vfsmount *this_parent = parent;
	struct list_head *next;
	int found = 0;

repeat:
	next = this_parent->mnt_mounts.next;
resume:
	while (next != &this_parent->mnt_mounts) {
		struct list_head *tmp = next;
		struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child);

		next = tmp->next;
		if (!(mnt->mnt_flags & MNT_SHRINKABLE))
			continue;
		/*
		 * Descend a level if the mnt_mounts list is non-empty.
		 */
		if (!list_empty(&mnt->mnt_mounts)) {
			this_parent = mnt;
			goto repeat;
		}

		if (!propagate_mount_busy(mnt, 1)) {
			list_move_tail(&mnt->mnt_expire, graveyard);
			found++;
		}
	}
	/*
	 * All done at this level ... ascend and resume the search
	 */
	if (this_parent != parent) {
		next = this_parent->mnt_child.next;
		this_parent = this_parent->mnt_parent;
		goto resume;
	}
	return found;
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 */
static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
{
	LIST_HEAD(graveyard);
	struct vfsmount *m;

	/* extract submounts of 'mountpoint' from the expiration list */
	while (select_submounts(mnt, &graveyard)) {
		while (!list_empty(&graveyard)) {
			m = list_first_entry(&graveyard, struct vfsmount,
						mnt_expire);
			touch_mnt_namespace(mnt->mnt_ns);
			umount_tree(m, 1, umounts);
		}
	}
}

/*
 * Some copy_from_user() implementations do not return the exact number of
 * bytes remaining to copy on a fault.  But copy_mount_options() requires that.
 * Note that this function differs from copy_from_user() in that it will oops
 * on bad values of `to', rather than returning a short copy.
 */
static long exact_copy_from_user(void *to, const void __user * from,
				 unsigned long n)
{
	char *t = to;
	const char __user *f = from;
	char c;

	if (!access_ok(VERIFY_READ, from, n))
		return n;

	while (n) {
		if (__get_user(c, f)) {
			memset(t, 0, n);
			break;
		}
		*t++ = c;
		f++;
		n--;
	}
	return n;
}
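/*
 * Added note (illustrative): copy_mount_options() below copies up to a
 * page of fs-specific option data without knowing its real length.  If
 * the user buffer ends before a full page - say only 10 bytes sit below
 * the end of the mapping - exact_copy_from_user() reports precisely how
 * many bytes made it across, so the remainder of the page can be zeroed
 * instead of failing the whole mount with -EFAULT.
 */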
int copy_mount_options(const void __user * data, unsigned long *where)
{
	int i;
	unsigned long page;
	unsigned long size;

	*where = 0;
	if (!data)
		return 0;

	if (!(page = __get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	/* We only care that *some* data at the address the user
	 * gave us is valid.  Just in case, we'll zero
	 * the remainder of the page.
	 */
	/* copy_from_user cannot cross TASK_SIZE ! */
	size = TASK_SIZE - (unsigned long)data;
	if (size > PAGE_SIZE)
		size = PAGE_SIZE;

	i = size - exact_copy_from_user((void *)page, data, size);
	if (!i) {
		free_page(page);
		return -EFAULT;
	}
	if (i != PAGE_SIZE)
		memset((char *)page + i, 0, PAGE_SIZE - i);
	*where = page;
	return 0;
}

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
long do_mount(char *dev_name, char *dir_name, char *type_page,
		  unsigned long flags, void *data_page)
{
	struct nameidata nd;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */

	if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
		return -EINVAL;
	if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
		return -EINVAL;

	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (flags & MS_RELATIME)
		mnt_flags |= MNT_RELATIME;

	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_KERNMOUNT);

	/* ... and get the mountpoint */
	retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
	if (retval)
		goto dput_out;

	if (flags & MS_REMOUNT)
		retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	else if (flags & MS_BIND)
		retval = do_loopback(&nd, dev_name, flags & MS_REC);
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&nd, flags);
	else if (flags & MS_MOVE)
		retval = do_move_mount(&nd, dev_name);
	else
		retval = do_new_mount(&nd, type_page, flags, mnt_flags,
				      dev_name, data_page);
dput_out:
	path_put(&nd.path);
	return retval;
}
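/*
 * For illustration: a typical mount(2) call from userspace, e.g.
 *
 *	mount("/dev/sda1", "/mnt", "ext3", MS_NOATIME, "errors=remount-ro");
 *
 * arrives here with MS_NOATIME split off into MNT_NOATIME for the
 * vfsmount, "errors=remount-ro" passed through as fs-specific data,
 * and the whole request dispatched to do_new_mount().
 */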
/*
 * Allocate a new namespace structure and populate it with contents
 * copied from the namespace of the passed in task structure.
 */
static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
		struct fs_struct *fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
	struct vfsmount *p, *q;

	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
	if (!new_ns)
		return ERR_PTR(-ENOMEM);

	atomic_set(&new_ns->count, 1);
	INIT_LIST_HEAD(&new_ns->list);
	init_waitqueue_head(&new_ns->poll);
	new_ns->event = 0;

	down_write(&namespace_sem);
	/* First pass: copy the tree topology */
	new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root,
					CL_COPY_ALL | CL_EXPIRE);
	if (!new_ns->root) {
		up_write(&namespace_sem);
		kfree(new_ns);
		return ERR_PTR(-ENOMEM);
	}
	spin_lock(&vfsmount_lock);
	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
	spin_unlock(&vfsmount_lock);

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace.  We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
	 */
	p = mnt_ns->root;
	q = new_ns->root;
	while (p) {
		q->mnt_ns = new_ns;
		if (fs) {
			if (p == fs->root.mnt) {
				rootmnt = p;
				fs->root.mnt = mntget(q);
			}
			if (p == fs->pwd.mnt) {
				pwdmnt = p;
				fs->pwd.mnt = mntget(q);
			}
			if (p == fs->altroot.mnt) {
				altrootmnt = p;
				fs->altroot.mnt = mntget(q);
			}
		}
		p = next_mnt(p, mnt_ns->root);
		q = next_mnt(q, new_ns->root);
	}
	up_write(&namespace_sem);

	if (rootmnt)
		mntput(rootmnt);
	if (pwdmnt)
		mntput(pwdmnt);
	if (altrootmnt)
		mntput(altrootmnt);

	return new_ns;
}

struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
		struct fs_struct *new_fs)
{
	struct mnt_namespace *new_ns;

	BUG_ON(!ns);
	get_mnt_ns(ns);

	if (!(flags & CLONE_NEWNS))
		return ns;

	new_ns = dup_mnt_ns(ns, new_fs);

	put_mnt_ns(ns);
	return new_ns;
}
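/*
 * For illustration: copy_mnt_ns() is what gives clone(2) and unshare(2)
 * callers a mount namespace of their own, e.g.
 *
 *	unshare(CLONE_NEWNS);	-- caller continues with a private copy
 *				   of the mount tree
 *
 * Without CLONE_NEWNS the existing namespace is shared, with only its
 * reference count bumped.
 */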
asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
			  char __user * type, unsigned long flags,
			  void __user * data)
{
	int retval;
	unsigned long data_page;
	unsigned long type_page;
	unsigned long dev_page;
	char *dir_page;

	retval = copy_mount_options(type, &type_page);
	if (retval < 0)
		return retval;

	dir_page = getname(dir_name);
	retval = PTR_ERR(dir_page);
	if (IS_ERR(dir_page))
		goto out1;

	retval = copy_mount_options(dev_name, &dev_page);
	if (retval < 0)
		goto out2;

	retval = copy_mount_options(data, &data_page);
	if (retval < 0)
		goto out3;

	lock_kernel();
	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
			  flags, (void *)data_page);
	unlock_kernel();
	free_page(data_page);

out3:
	free_page(dev_page);
out2:
	putname(dir_page);
out1:
	free_page(type_page);
	return retval;
}

/*
 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
 * It can block. Requires the big lock held.
 */
void set_fs_root(struct fs_struct *fs, struct path *path)
{
	struct path old_root;

	write_lock(&fs->lock);
	old_root = fs->root;
	fs->root = *path;
	path_get(path);
	write_unlock(&fs->lock);
	if (old_root.dentry)
		path_put(&old_root);
}

/*
 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
 * It can block. Requires the big lock held.
 */
void set_fs_pwd(struct fs_struct *fs, struct path *path)
{
	struct path old_pwd;

	write_lock(&fs->lock);
	old_pwd = fs->pwd;
	fs->pwd = *path;
	path_get(path);
	write_unlock(&fs->lock);

	if (old_pwd.dentry)
		path_put(&old_pwd);
}

static void chroot_fs_refs(struct path *old_root, struct path *new_root)
{
	struct task_struct *g, *p;
	struct fs_struct *fs;

	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		task_lock(p);
		fs = p->fs;
		if (fs) {
			atomic_inc(&fs->count);
			task_unlock(p);
			if (fs->root.dentry == old_root->dentry
			    && fs->root.mnt == old_root->mnt)
				set_fs_root(fs, new_root);
			if (fs->pwd.dentry == old_root->dentry
			    && fs->pwd.mnt == old_root->mnt)
				set_fs_pwd(fs, new_root);
			put_fs_struct(fs);
		} else
			task_unlock(p);
	} while_each_thread(g, p);
	read_unlock(&tasklist_lock);
}
/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
asmlinkage long sys_pivot_root(const char __user * new_root,
			       const char __user * put_old)
{
	struct vfsmount *tmp;
	struct nameidata new_nd, old_nd, user_nd;
	struct path parent_path, root_parent;
	int error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	lock_kernel();

	error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
			    &new_nd);
	if (error)
		goto out0;
	error = -EINVAL;
	if (!check_mnt(new_nd.path.mnt))
		goto out1;

	error = __user_walk(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_nd);
	if (error)
		goto out1;

	error = security_sb_pivotroot(&old_nd, &new_nd);
	if (error) {
		path_put(&old_nd.path);
		goto out1;
	}

	read_lock(&current->fs->lock);
	user_nd.path = current->fs->root;
	path_get(&current->fs->root);
	read_unlock(&current->fs->lock);
	down_write(&namespace_sem);
	mutex_lock(&old_nd.path.dentry->d_inode->i_mutex);
	error = -EINVAL;
	if (IS_MNT_SHARED(old_nd.path.mnt) ||
		IS_MNT_SHARED(new_nd.path.mnt->mnt_parent) ||
		IS_MNT_SHARED(user_nd.path.mnt->mnt_parent))
		goto out2;
	if (!check_mnt(user_nd.path.mnt))
		goto out2;
	error = -ENOENT;
	if (IS_DEADDIR(new_nd.path.dentry->d_inode))
		goto out2;
	if (d_unhashed(new_nd.path.dentry) && !IS_ROOT(new_nd.path.dentry))
		goto out2;
	if (d_unhashed(old_nd.path.dentry) && !IS_ROOT(old_nd.path.dentry))
		goto out2;
	error = -EBUSY;
	if (new_nd.path.mnt == user_nd.path.mnt ||
	    old_nd.path.mnt == user_nd.path.mnt)
		goto out2; /* loop, on the same file system */
	error = -EINVAL;
	if (user_nd.path.mnt->mnt_root != user_nd.path.dentry)
		goto out2; /* not a mountpoint */
	if (user_nd.path.mnt->mnt_parent == user_nd.path.mnt)
		goto out2; /* not attached */
	if (new_nd.path.mnt->mnt_root != new_nd.path.dentry)
		goto out2; /* not a mountpoint */
	if (new_nd.path.mnt->mnt_parent == new_nd.path.mnt)
		goto out2; /* not attached */
	/* make sure we can reach put_old from new_root */
	tmp = old_nd.path.mnt;
	spin_lock(&vfsmount_lock);
	if (tmp != new_nd.path.mnt) {
		for (;;) {
			if (tmp->mnt_parent == tmp)
				goto out3; /* already mounted on put_old */
			if (tmp->mnt_parent == new_nd.path.mnt)
				break;
			tmp = tmp->mnt_parent;
		}
		if (!is_subdir(tmp->mnt_mountpoint, new_nd.path.dentry))
			goto out3;
	} else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry))
		goto out3;
	detach_mnt(new_nd.path.mnt, &parent_path);
	detach_mnt(user_nd.path.mnt, &root_parent);
	/* mount old root on put_old */
	attach_mnt(user_nd.path.mnt, &old_nd.path);
	/* mount new_root on / */
	attach_mnt(new_nd.path.mnt, &root_parent);
	touch_mnt_namespace(current->nsproxy->mnt_ns);
	spin_unlock(&vfsmount_lock);
	chroot_fs_refs(&user_nd.path, &new_nd.path);
	security_sb_post_pivotroot(&user_nd, &new_nd);
	error = 0;
	path_put(&root_parent);
	path_put(&parent_path);
out2:
	mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex);
	up_write(&namespace_sem);
	path_put(&user_nd.path);
	path_put(&old_nd.path);
out1:
	path_put(&new_nd.path);
out0:
	unlock_kernel();
	return error;
out3:
	spin_unlock(&vfsmount_lock);
	goto out2;
}
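/*
 * For illustration, the classic sequence from an initial ramdisk (see
 * pivot_root(8); the paths are examples only):
 *
 *	mount /dev/hda1 /new-root
 *	cd /new-root
 *	pivot_root . old-root
 *	exec chroot . /sbin/init </dev/console >/dev/console 2>&1
 *	umount /old-root
 */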
static void __init init_mount_tree(void)
{
	struct vfsmount *mnt;
	struct mnt_namespace *ns;
	struct path root;

	mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
	if (IS_ERR(mnt))
		panic("Can't create rootfs");
	ns = kmalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		panic("Can't allocate initial namespace");
	atomic_set(&ns->count, 1);
	INIT_LIST_HEAD(&ns->list);
	init_waitqueue_head(&ns->poll);
	ns->event = 0;
	list_add(&mnt->mnt_list, &ns->list);
	ns->root = mnt;
	mnt->mnt_ns = ns;

	init_task.nsproxy->mnt_ns = ns;
	get_mnt_ns(ns);

	root.mnt = ns->root;
	root.dentry = ns->root->mnt_root;

	set_fs_pwd(current->fs, &root);
	set_fs_root(current->fs, &root);
}

void __init mnt_init(void)
{
	unsigned u;
	int err;

	init_rwsem(&namespace_sem);

	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);

	if (!mount_hashtable)
		panic("Failed to allocate mount hash table\n");

	printk("Mount-cache hash table entries: %lu\n", HASH_SIZE);

	for (u = 0; u < HASH_SIZE; u++)
		INIT_LIST_HEAD(&mount_hashtable[u]);

	err = sysfs_init();
	if (err)
		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
			__FUNCTION__, err);
	fs_kobj = kobject_create_and_add("fs", NULL);
	if (!fs_kobj)
		printk(KERN_WARNING "%s: kobj create error\n", __FUNCTION__);
	init_rootfs();
	init_mount_tree();
}

void __put_mnt_ns(struct mnt_namespace *ns)
{
	struct vfsmount *root = ns->root;
	LIST_HEAD(umount_list);
	ns->root = NULL;
	spin_unlock(&vfsmount_lock);
	down_write(&namespace_sem);
	spin_lock(&vfsmount_lock);
	umount_tree(root, 0, &umount_list);
	spin_unlock(&vfsmount_lock);
	up_write(&namespace_sem);
	release_mounts(&umount_list);
	kfree(ns);
}