/*
 * linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *	Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/acct.h>
#include <linux/capability.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/seq_file.h>
#include <linux/mnt_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/log2.h>
#include <linux/idr.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include "pnode.h"
#include "internal.h"

#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
#define HASH_SIZE (1UL << HASH_SHIFT)

/* spinlock for vfsmount related operations, in place of dcache_lock */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);

static int event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);

static struct list_head *mount_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static struct rw_semaphore namespace_sem;

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> HASH_SHIFT);
	return tmp & (HASH_SIZE - 1);
}

#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)

/* allocation is serialized by namespace_sem */
static int mnt_alloc_id(struct vfsmount *mnt)
{
	int res;

retry:
	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
	spin_lock(&vfsmount_lock);
	res = ida_get_new(&mnt_id_ida, &mnt->mnt_id);
	spin_unlock(&vfsmount_lock);
	if (res == -EAGAIN)
		goto retry;

	return res;
}

static void mnt_free_id(struct vfsmount *mnt)
{
	spin_lock(&vfsmount_lock);
	ida_remove(&mnt_id_ida, mnt->mnt_id);
	spin_unlock(&vfsmount_lock);
}

/*
 * Allocate a new peer group ID
 *
 * mnt_group_ida is protected by namespace_sem
 */
static int mnt_alloc_group_id(struct vfsmount *mnt)
{
	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
		return -ENOMEM;

	return ida_get_new_above(&mnt_group_ida, 1, &mnt->mnt_group_id);
}

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct vfsmount *mnt)
{
	ida_remove(&mnt_group_ida, mnt->mnt_group_id);
	mnt->mnt_group_id = 0;
}

struct vfsmount *alloc_vfsmnt(const char *name)
{
	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		int err;

		err = mnt_alloc_id(mnt);
		if (err) {
			kmem_cache_free(mnt_cache, mnt);
			return NULL;
		}

		atomic_set(&mnt->mnt_count, 1);
		INIT_LIST_HEAD(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		atomic_set(&mnt->__mnt_writers, 0);
		if (name) {
			int size = strlen(name) + 1;
			char *newname = kmalloc(size, GFP_KERNEL);
			if (newname) {
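				/* keep a private copy of the device name; it is freed in free_vfsmnt() */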
memcpy(newname, name, size); 135 mnt->mnt_devname = newname; 136 } 137 } 138 } 139 return mnt; 140 } 141 142 /* 143 * Most r/o checks on a fs are for operations that take 144 * discrete amounts of time, like a write() or unlink(). 145 * We must keep track of when those operations start 146 * (for permission checks) and when they end, so that 147 * we can determine when writes are able to occur to 148 * a filesystem. 149 */ 150 /* 151 * __mnt_is_readonly: check whether a mount is read-only 152 * @mnt: the mount to check for its write status 153 * 154 * This shouldn't be used directly ouside of the VFS. 155 * It does not guarantee that the filesystem will stay 156 * r/w, just that it is right *now*. This can not and 157 * should not be used in place of IS_RDONLY(inode). 158 * mnt_want/drop_write() will _keep_ the filesystem 159 * r/w. 160 */ 161 int __mnt_is_readonly(struct vfsmount *mnt) 162 { 163 if (mnt->mnt_flags & MNT_READONLY) 164 return 1; 165 if (mnt->mnt_sb->s_flags & MS_RDONLY) 166 return 1; 167 return 0; 168 } 169 EXPORT_SYMBOL_GPL(__mnt_is_readonly); 170 171 struct mnt_writer { 172 /* 173 * If holding multiple instances of this lock, they 174 * must be ordered by cpu number. 175 */ 176 spinlock_t lock; 177 struct lock_class_key lock_class; /* compiles out with !lockdep */ 178 unsigned long count; 179 struct vfsmount *mnt; 180 } ____cacheline_aligned_in_smp; 181 static DEFINE_PER_CPU(struct mnt_writer, mnt_writers); 182 183 static int __init init_mnt_writers(void) 184 { 185 int cpu; 186 for_each_possible_cpu(cpu) { 187 struct mnt_writer *writer = &per_cpu(mnt_writers, cpu); 188 spin_lock_init(&writer->lock); 189 lockdep_set_class(&writer->lock, &writer->lock_class); 190 writer->count = 0; 191 } 192 return 0; 193 } 194 fs_initcall(init_mnt_writers); 195 196 static void unlock_mnt_writers(void) 197 { 198 int cpu; 199 struct mnt_writer *cpu_writer; 200 201 for_each_possible_cpu(cpu) { 202 cpu_writer = &per_cpu(mnt_writers, cpu); 203 spin_unlock(&cpu_writer->lock); 204 } 205 } 206 207 static inline void __clear_mnt_count(struct mnt_writer *cpu_writer) 208 { 209 if (!cpu_writer->mnt) 210 return; 211 /* 212 * This is in case anyone ever leaves an invalid, 213 * old ->mnt and a count of 0. 214 */ 215 if (!cpu_writer->count) 216 return; 217 atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers); 218 cpu_writer->count = 0; 219 } 220 /* 221 * must hold cpu_writer->lock 222 */ 223 static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer, 224 struct vfsmount *mnt) 225 { 226 if (cpu_writer->mnt == mnt) 227 return; 228 __clear_mnt_count(cpu_writer); 229 cpu_writer->mnt = mnt; 230 } 231 232 /* 233 * Most r/o checks on a fs are for operations that take 234 * discrete amounts of time, like a write() or unlink(). 235 * We must keep track of when those operations start 236 * (for permission checks) and when they end, so that 237 * we can determine when writes are able to occur to 238 * a filesystem. 239 */ 240 /** 241 * mnt_want_write - get write access to a mount 242 * @mnt: the mount on which to take a write 243 * 244 * This tells the low-level filesystem that a write is 245 * about to be performed to it, and makes sure that 246 * writes are allowed before returning success. When 247 * the write operation is finished, mnt_drop_write() 248 * must be called. This is effectively a refcount. 
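 *
 * Illustrative caller (a sketch only; examplefs_write_op() and
 * examplefs_do_write() are made-up names, not part of this file):
 *
 *	static int examplefs_write_op(struct file *file)
 *	{
 *		int err = mnt_want_write(file->f_path.mnt);
 *		if (err)
 *			return err;
 *		err = examplefs_do_write(file);
 *		mnt_drop_write(file->f_path.mnt);
 *		return err;
 *	}
 *
 * On a read-only mount the first call fails with -EROFS and the write is
 * never attempted.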
249 */ 250 int mnt_want_write(struct vfsmount *mnt) 251 { 252 int ret = 0; 253 struct mnt_writer *cpu_writer; 254 255 cpu_writer = &get_cpu_var(mnt_writers); 256 spin_lock(&cpu_writer->lock); 257 if (__mnt_is_readonly(mnt)) { 258 ret = -EROFS; 259 goto out; 260 } 261 use_cpu_writer_for_mount(cpu_writer, mnt); 262 cpu_writer->count++; 263 out: 264 spin_unlock(&cpu_writer->lock); 265 put_cpu_var(mnt_writers); 266 return ret; 267 } 268 EXPORT_SYMBOL_GPL(mnt_want_write); 269 270 static void lock_mnt_writers(void) 271 { 272 int cpu; 273 struct mnt_writer *cpu_writer; 274 275 for_each_possible_cpu(cpu) { 276 cpu_writer = &per_cpu(mnt_writers, cpu); 277 spin_lock(&cpu_writer->lock); 278 __clear_mnt_count(cpu_writer); 279 cpu_writer->mnt = NULL; 280 } 281 } 282 283 /* 284 * These per-cpu write counts are not guaranteed to have 285 * matched increments and decrements on any given cpu. 286 * A file open()ed for write on one cpu and close()d on 287 * another cpu will imbalance this count. Make sure it 288 * does not get too far out of whack. 289 */ 290 static void handle_write_count_underflow(struct vfsmount *mnt) 291 { 292 if (atomic_read(&mnt->__mnt_writers) >= 293 MNT_WRITER_UNDERFLOW_LIMIT) 294 return; 295 /* 296 * It isn't necessary to hold all of the locks 297 * at the same time, but doing it this way makes 298 * us share a lot more code. 299 */ 300 lock_mnt_writers(); 301 /* 302 * vfsmount_lock is for mnt_flags. 303 */ 304 spin_lock(&vfsmount_lock); 305 /* 306 * If coalescing the per-cpu writer counts did not 307 * get us back to a positive writer count, we have 308 * a bug. 309 */ 310 if ((atomic_read(&mnt->__mnt_writers) < 0) && 311 !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) { 312 printk(KERN_DEBUG "leak detected on mount(%p) writers " 313 "count: %d\n", 314 mnt, atomic_read(&mnt->__mnt_writers)); 315 WARN_ON(1); 316 /* use the flag to keep the dmesg spam down */ 317 mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT; 318 } 319 spin_unlock(&vfsmount_lock); 320 unlock_mnt_writers(); 321 } 322 323 /** 324 * mnt_drop_write - give up write access to a mount 325 * @mnt: the mount on which to give up write access 326 * 327 * Tells the low-level filesystem that we are done 328 * performing writes to it. Must be matched with 329 * mnt_want_write() call above. 330 */ 331 void mnt_drop_write(struct vfsmount *mnt) 332 { 333 int must_check_underflow = 0; 334 struct mnt_writer *cpu_writer; 335 336 cpu_writer = &get_cpu_var(mnt_writers); 337 spin_lock(&cpu_writer->lock); 338 339 use_cpu_writer_for_mount(cpu_writer, mnt); 340 if (cpu_writer->count > 0) { 341 cpu_writer->count--; 342 } else { 343 must_check_underflow = 1; 344 atomic_dec(&mnt->__mnt_writers); 345 } 346 347 spin_unlock(&cpu_writer->lock); 348 /* 349 * Logically, we could call this each time, 350 * but the __mnt_writers cacheline tends to 351 * be cold, and makes this expensive. 352 */ 353 if (must_check_underflow) 354 handle_write_count_underflow(mnt); 355 /* 356 * This could be done right after the spinlock 357 * is taken because the spinlock keeps us on 358 * the cpu, and disables preemption. However, 359 * putting it here bounds the amount that 360 * __mnt_writers can underflow. Without it, 361 * we could theoretically wrap __mnt_writers. 
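 *
 * (Accounting note, inferred from the code rather than stated elsewhere:
 * the true number of writers on @mnt is the atomic __mnt_writers value
 * plus every per-cpu mnt_writers.count that currently points at @mnt.
 * That is why mnt_make_readonly() grabs every per-cpu lock before it
 * trusts the atomic counter, and why an unpaired decrement here only
 * looks like an "underflow" until the per-cpu counts are folded back in.)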
362 */ 363 put_cpu_var(mnt_writers); 364 } 365 EXPORT_SYMBOL_GPL(mnt_drop_write); 366 367 static int mnt_make_readonly(struct vfsmount *mnt) 368 { 369 int ret = 0; 370 371 lock_mnt_writers(); 372 /* 373 * With all the locks held, this value is stable 374 */ 375 if (atomic_read(&mnt->__mnt_writers) > 0) { 376 ret = -EBUSY; 377 goto out; 378 } 379 /* 380 * nobody can do a successful mnt_want_write() with all 381 * of the counts in MNT_DENIED_WRITE and the locks held. 382 */ 383 spin_lock(&vfsmount_lock); 384 if (!ret) 385 mnt->mnt_flags |= MNT_READONLY; 386 spin_unlock(&vfsmount_lock); 387 out: 388 unlock_mnt_writers(); 389 return ret; 390 } 391 392 static void __mnt_unmake_readonly(struct vfsmount *mnt) 393 { 394 spin_lock(&vfsmount_lock); 395 mnt->mnt_flags &= ~MNT_READONLY; 396 spin_unlock(&vfsmount_lock); 397 } 398 399 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 400 { 401 mnt->mnt_sb = sb; 402 mnt->mnt_root = dget(sb->s_root); 403 return 0; 404 } 405 406 EXPORT_SYMBOL(simple_set_mnt); 407 408 void free_vfsmnt(struct vfsmount *mnt) 409 { 410 kfree(mnt->mnt_devname); 411 mnt_free_id(mnt); 412 kmem_cache_free(mnt_cache, mnt); 413 } 414 415 /* 416 * find the first or last mount at @dentry on vfsmount @mnt depending on 417 * @dir. If @dir is set return the first mount else return the last mount. 418 */ 419 struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, 420 int dir) 421 { 422 struct list_head *head = mount_hashtable + hash(mnt, dentry); 423 struct list_head *tmp = head; 424 struct vfsmount *p, *found = NULL; 425 426 for (;;) { 427 tmp = dir ? tmp->next : tmp->prev; 428 p = NULL; 429 if (tmp == head) 430 break; 431 p = list_entry(tmp, struct vfsmount, mnt_hash); 432 if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) { 433 found = p; 434 break; 435 } 436 } 437 return found; 438 } 439 440 /* 441 * lookup_mnt increments the ref count before returning 442 * the vfsmount struct. 
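 *
 * Returns NULL if nothing is mounted on @dentry in @mnt.  The caller owns
 * the reference taken here and must eventually drop it with mntput() (or
 * transfer it into a struct path).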
443 */ 444 struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) 445 { 446 struct vfsmount *child_mnt; 447 spin_lock(&vfsmount_lock); 448 if ((child_mnt = __lookup_mnt(mnt, dentry, 1))) 449 mntget(child_mnt); 450 spin_unlock(&vfsmount_lock); 451 return child_mnt; 452 } 453 454 static inline int check_mnt(struct vfsmount *mnt) 455 { 456 return mnt->mnt_ns == current->nsproxy->mnt_ns; 457 } 458 459 static void touch_mnt_namespace(struct mnt_namespace *ns) 460 { 461 if (ns) { 462 ns->event = ++event; 463 wake_up_interruptible(&ns->poll); 464 } 465 } 466 467 static void __touch_mnt_namespace(struct mnt_namespace *ns) 468 { 469 if (ns && ns->event != event) { 470 ns->event = event; 471 wake_up_interruptible(&ns->poll); 472 } 473 } 474 475 static void detach_mnt(struct vfsmount *mnt, struct path *old_path) 476 { 477 old_path->dentry = mnt->mnt_mountpoint; 478 old_path->mnt = mnt->mnt_parent; 479 mnt->mnt_parent = mnt; 480 mnt->mnt_mountpoint = mnt->mnt_root; 481 list_del_init(&mnt->mnt_child); 482 list_del_init(&mnt->mnt_hash); 483 old_path->dentry->d_mounted--; 484 } 485 486 void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, 487 struct vfsmount *child_mnt) 488 { 489 child_mnt->mnt_parent = mntget(mnt); 490 child_mnt->mnt_mountpoint = dget(dentry); 491 dentry->d_mounted++; 492 } 493 494 static void attach_mnt(struct vfsmount *mnt, struct path *path) 495 { 496 mnt_set_mountpoint(path->mnt, path->dentry, mnt); 497 list_add_tail(&mnt->mnt_hash, mount_hashtable + 498 hash(path->mnt, path->dentry)); 499 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); 500 } 501 502 /* 503 * the caller must hold vfsmount_lock 504 */ 505 static void commit_tree(struct vfsmount *mnt) 506 { 507 struct vfsmount *parent = mnt->mnt_parent; 508 struct vfsmount *m; 509 LIST_HEAD(head); 510 struct mnt_namespace *n = parent->mnt_ns; 511 512 BUG_ON(parent == mnt); 513 514 list_add_tail(&head, &mnt->mnt_list); 515 list_for_each_entry(m, &head, mnt_list) 516 m->mnt_ns = n; 517 list_splice(&head, n->list.prev); 518 519 list_add_tail(&mnt->mnt_hash, mount_hashtable + 520 hash(parent, mnt->mnt_mountpoint)); 521 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); 522 touch_mnt_namespace(n); 523 } 524 525 static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) 526 { 527 struct list_head *next = p->mnt_mounts.next; 528 if (next == &p->mnt_mounts) { 529 while (1) { 530 if (p == root) 531 return NULL; 532 next = p->mnt_child.next; 533 if (next != &p->mnt_parent->mnt_mounts) 534 break; 535 p = p->mnt_parent; 536 } 537 } 538 return list_entry(next, struct vfsmount, mnt_child); 539 } 540 541 static struct vfsmount *skip_mnt_tree(struct vfsmount *p) 542 { 543 struct list_head *prev = p->mnt_mounts.prev; 544 while (prev != &p->mnt_mounts) { 545 p = list_entry(prev, struct vfsmount, mnt_child); 546 prev = p->mnt_mounts.prev; 547 } 548 return p; 549 } 550 551 static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, 552 int flag) 553 { 554 struct super_block *sb = old->mnt_sb; 555 struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname); 556 557 if (mnt) { 558 if (flag & (CL_SLAVE | CL_PRIVATE)) 559 mnt->mnt_group_id = 0; /* not a peer of original */ 560 else 561 mnt->mnt_group_id = old->mnt_group_id; 562 563 if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { 564 int err = mnt_alloc_group_id(mnt); 565 if (err) 566 goto out_free; 567 } 568 569 mnt->mnt_flags = old->mnt_flags; 570 atomic_inc(&sb->s_active); 571 mnt->mnt_sb = sb; 572 mnt->mnt_root = dget(root); 573 
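		/* the clone starts out self-parented; mnt_set_mountpoint()/attach_mnt() will graft it under a real parent later */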
mnt->mnt_mountpoint = mnt->mnt_root; 574 mnt->mnt_parent = mnt; 575 576 if (flag & CL_SLAVE) { 577 list_add(&mnt->mnt_slave, &old->mnt_slave_list); 578 mnt->mnt_master = old; 579 CLEAR_MNT_SHARED(mnt); 580 } else if (!(flag & CL_PRIVATE)) { 581 if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old)) 582 list_add(&mnt->mnt_share, &old->mnt_share); 583 if (IS_MNT_SLAVE(old)) 584 list_add(&mnt->mnt_slave, &old->mnt_slave); 585 mnt->mnt_master = old->mnt_master; 586 } 587 if (flag & CL_MAKE_SHARED) 588 set_mnt_shared(mnt); 589 590 /* stick the duplicate mount on the same expiry list 591 * as the original if that was on one */ 592 if (flag & CL_EXPIRE) { 593 if (!list_empty(&old->mnt_expire)) 594 list_add(&mnt->mnt_expire, &old->mnt_expire); 595 } 596 } 597 return mnt; 598 599 out_free: 600 free_vfsmnt(mnt); 601 return NULL; 602 } 603 604 static inline void __mntput(struct vfsmount *mnt) 605 { 606 int cpu; 607 struct super_block *sb = mnt->mnt_sb; 608 /* 609 * We don't have to hold all of the locks at the 610 * same time here because we know that we're the 611 * last reference to mnt and that no new writers 612 * can come in. 613 */ 614 for_each_possible_cpu(cpu) { 615 struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); 616 if (cpu_writer->mnt != mnt) 617 continue; 618 spin_lock(&cpu_writer->lock); 619 atomic_add(cpu_writer->count, &mnt->__mnt_writers); 620 cpu_writer->count = 0; 621 /* 622 * Might as well do this so that no one 623 * ever sees the pointer and expects 624 * it to be valid. 625 */ 626 cpu_writer->mnt = NULL; 627 spin_unlock(&cpu_writer->lock); 628 } 629 /* 630 * This probably indicates that somebody messed 631 * up a mnt_want/drop_write() pair. If this 632 * happens, the filesystem was probably unable 633 * to make r/w->r/o transitions. 634 */ 635 WARN_ON(atomic_read(&mnt->__mnt_writers)); 636 dput(mnt->mnt_root); 637 free_vfsmnt(mnt); 638 deactivate_super(sb); 639 } 640 641 void mntput_no_expire(struct vfsmount *mnt) 642 { 643 repeat: 644 if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) { 645 if (likely(!mnt->mnt_pinned)) { 646 spin_unlock(&vfsmount_lock); 647 __mntput(mnt); 648 return; 649 } 650 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count); 651 mnt->mnt_pinned = 0; 652 spin_unlock(&vfsmount_lock); 653 acct_auto_close_mnt(mnt); 654 security_sb_umount_close(mnt); 655 goto repeat; 656 } 657 } 658 659 EXPORT_SYMBOL(mntput_no_expire); 660 661 void mnt_pin(struct vfsmount *mnt) 662 { 663 spin_lock(&vfsmount_lock); 664 mnt->mnt_pinned++; 665 spin_unlock(&vfsmount_lock); 666 } 667 668 EXPORT_SYMBOL(mnt_pin); 669 670 void mnt_unpin(struct vfsmount *mnt) 671 { 672 spin_lock(&vfsmount_lock); 673 if (mnt->mnt_pinned) { 674 atomic_inc(&mnt->mnt_count); 675 mnt->mnt_pinned--; 676 } 677 spin_unlock(&vfsmount_lock); 678 } 679 680 EXPORT_SYMBOL(mnt_unpin); 681 682 static inline void mangle(struct seq_file *m, const char *s) 683 { 684 seq_escape(m, s, " \t\n\\"); 685 } 686 687 /* 688 * Simple .show_options callback for filesystems which don't want to 689 * implement more complex mount option showing. 690 * 691 * See also save_mount_options(). 692 */ 693 int generic_show_options(struct seq_file *m, struct vfsmount *mnt) 694 { 695 const char *options = mnt->mnt_sb->s_options; 696 697 if (options != NULL && options[0]) { 698 seq_putc(m, ','); 699 mangle(m, options); 700 } 701 702 return 0; 703 } 704 EXPORT_SYMBOL(generic_show_options); 705 706 /* 707 * If filesystem uses generic_show_options(), this function should be 708 * called from the fill_super() callback. 
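 *
 * Illustrative fill_super() usage (a sketch only; the examplefs_* names
 * are made up):
 *
 *	static int examplefs_fill_super(struct super_block *sb,
 *					void *data, int silent)
 *	{
 *		save_mount_options(sb, data);
 *		... parse data and set up sb as usual ...
 *		return 0;
 *	}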
709 * 710 * The .remount_fs callback usually needs to be handled in a special 711 * way, to make sure, that previous options are not overwritten if the 712 * remount fails. 713 * 714 * Also note, that if the filesystem's .remount_fs function doesn't 715 * reset all options to their default value, but changes only newly 716 * given options, then the displayed options will not reflect reality 717 * any more. 718 */ 719 void save_mount_options(struct super_block *sb, char *options) 720 { 721 kfree(sb->s_options); 722 sb->s_options = kstrdup(options, GFP_KERNEL); 723 } 724 EXPORT_SYMBOL(save_mount_options); 725 726 #ifdef CONFIG_PROC_FS 727 /* iterator */ 728 static void *m_start(struct seq_file *m, loff_t *pos) 729 { 730 struct proc_mounts *p = m->private; 731 732 down_read(&namespace_sem); 733 return seq_list_start(&p->ns->list, *pos); 734 } 735 736 static void *m_next(struct seq_file *m, void *v, loff_t *pos) 737 { 738 struct proc_mounts *p = m->private; 739 740 return seq_list_next(v, &p->ns->list, pos); 741 } 742 743 static void m_stop(struct seq_file *m, void *v) 744 { 745 up_read(&namespace_sem); 746 } 747 748 struct proc_fs_info { 749 int flag; 750 const char *str; 751 }; 752 753 static void show_sb_opts(struct seq_file *m, struct super_block *sb) 754 { 755 static const struct proc_fs_info fs_info[] = { 756 { MS_SYNCHRONOUS, ",sync" }, 757 { MS_DIRSYNC, ",dirsync" }, 758 { MS_MANDLOCK, ",mand" }, 759 { 0, NULL } 760 }; 761 const struct proc_fs_info *fs_infop; 762 763 for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { 764 if (sb->s_flags & fs_infop->flag) 765 seq_puts(m, fs_infop->str); 766 } 767 } 768 769 static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) 770 { 771 static const struct proc_fs_info mnt_info[] = { 772 { MNT_NOSUID, ",nosuid" }, 773 { MNT_NODEV, ",nodev" }, 774 { MNT_NOEXEC, ",noexec" }, 775 { MNT_NOATIME, ",noatime" }, 776 { MNT_NODIRATIME, ",nodiratime" }, 777 { MNT_RELATIME, ",relatime" }, 778 { 0, NULL } 779 }; 780 const struct proc_fs_info *fs_infop; 781 782 for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { 783 if (mnt->mnt_flags & fs_infop->flag) 784 seq_puts(m, fs_infop->str); 785 } 786 } 787 788 static void show_type(struct seq_file *m, struct super_block *sb) 789 { 790 mangle(m, sb->s_type->name); 791 if (sb->s_subtype && sb->s_subtype[0]) { 792 seq_putc(m, '.'); 793 mangle(m, sb->s_subtype); 794 } 795 } 796 797 static int show_vfsmnt(struct seq_file *m, void *v) 798 { 799 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); 800 int err = 0; 801 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 802 803 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 804 seq_putc(m, ' '); 805 seq_path(m, &mnt_path, " \t\n\\"); 806 seq_putc(m, ' '); 807 show_type(m, mnt->mnt_sb); 808 seq_puts(m, __mnt_is_readonly(mnt) ? 
" ro" : " rw"); 809 show_sb_opts(m, mnt->mnt_sb); 810 show_mnt_opts(m, mnt); 811 if (mnt->mnt_sb->s_op->show_options) 812 err = mnt->mnt_sb->s_op->show_options(m, mnt); 813 seq_puts(m, " 0 0\n"); 814 return err; 815 } 816 817 const struct seq_operations mounts_op = { 818 .start = m_start, 819 .next = m_next, 820 .stop = m_stop, 821 .show = show_vfsmnt 822 }; 823 824 static int show_mountinfo(struct seq_file *m, void *v) 825 { 826 struct proc_mounts *p = m->private; 827 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); 828 struct super_block *sb = mnt->mnt_sb; 829 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 830 struct path root = p->root; 831 int err = 0; 832 833 seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, 834 MAJOR(sb->s_dev), MINOR(sb->s_dev)); 835 seq_dentry(m, mnt->mnt_root, " \t\n\\"); 836 seq_putc(m, ' '); 837 seq_path_root(m, &mnt_path, &root, " \t\n\\"); 838 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { 839 /* 840 * Mountpoint is outside root, discard that one. Ugly, 841 * but less so than trying to do that in iterator in a 842 * race-free way (due to renames). 843 */ 844 return SEQ_SKIP; 845 } 846 seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); 847 show_mnt_opts(m, mnt); 848 849 /* Tagged fields ("foo:X" or "bar") */ 850 if (IS_MNT_SHARED(mnt)) 851 seq_printf(m, " shared:%i", mnt->mnt_group_id); 852 if (IS_MNT_SLAVE(mnt)) { 853 int master = mnt->mnt_master->mnt_group_id; 854 int dom = get_dominating_id(mnt, &p->root); 855 seq_printf(m, " master:%i", master); 856 if (dom && dom != master) 857 seq_printf(m, " propagate_from:%i", dom); 858 } 859 if (IS_MNT_UNBINDABLE(mnt)) 860 seq_puts(m, " unbindable"); 861 862 /* Filesystem specific data */ 863 seq_puts(m, " - "); 864 show_type(m, sb); 865 seq_putc(m, ' '); 866 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 867 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); 868 show_sb_opts(m, sb); 869 if (sb->s_op->show_options) 870 err = sb->s_op->show_options(m, mnt); 871 seq_putc(m, '\n'); 872 return err; 873 } 874 875 const struct seq_operations mountinfo_op = { 876 .start = m_start, 877 .next = m_next, 878 .stop = m_stop, 879 .show = show_mountinfo, 880 }; 881 882 static int show_vfsstat(struct seq_file *m, void *v) 883 { 884 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); 885 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 886 int err = 0; 887 888 /* device */ 889 if (mnt->mnt_devname) { 890 seq_puts(m, "device "); 891 mangle(m, mnt->mnt_devname); 892 } else 893 seq_puts(m, "no device"); 894 895 /* mount point */ 896 seq_puts(m, " mounted on "); 897 seq_path(m, &mnt_path, " \t\n\\"); 898 seq_putc(m, ' '); 899 900 /* file system type */ 901 seq_puts(m, "with fstype "); 902 show_type(m, mnt->mnt_sb); 903 904 /* optional statistics */ 905 if (mnt->mnt_sb->s_op->show_stats) { 906 seq_putc(m, ' '); 907 err = mnt->mnt_sb->s_op->show_stats(m, mnt); 908 } 909 910 seq_putc(m, '\n'); 911 return err; 912 } 913 914 const struct seq_operations mountstats_op = { 915 .start = m_start, 916 .next = m_next, 917 .stop = m_stop, 918 .show = show_vfsstat, 919 }; 920 #endif /* CONFIG_PROC_FS */ 921 922 /** 923 * may_umount_tree - check if a mount tree is busy 924 * @mnt: root of mount tree 925 * 926 * This is called to check if a tree of mounts has any 927 * open files, pwds, chroots or sub mounts that are 928 * busy. 
929 */ 930 int may_umount_tree(struct vfsmount *mnt) 931 { 932 int actual_refs = 0; 933 int minimum_refs = 0; 934 struct vfsmount *p; 935 936 spin_lock(&vfsmount_lock); 937 for (p = mnt; p; p = next_mnt(p, mnt)) { 938 actual_refs += atomic_read(&p->mnt_count); 939 minimum_refs += 2; 940 } 941 spin_unlock(&vfsmount_lock); 942 943 if (actual_refs > minimum_refs) 944 return 0; 945 946 return 1; 947 } 948 949 EXPORT_SYMBOL(may_umount_tree); 950 951 /** 952 * may_umount - check if a mount point is busy 953 * @mnt: root of mount 954 * 955 * This is called to check if a mount point has any 956 * open files, pwds, chroots or sub mounts. If the 957 * mount has sub mounts this will return busy 958 * regardless of whether the sub mounts are busy. 959 * 960 * Doesn't take quota and stuff into account. IOW, in some cases it will 961 * give false negatives. The main reason why it's here is that we need 962 * a non-destructive way to look for easily umountable filesystems. 963 */ 964 int may_umount(struct vfsmount *mnt) 965 { 966 int ret = 1; 967 spin_lock(&vfsmount_lock); 968 if (propagate_mount_busy(mnt, 2)) 969 ret = 0; 970 spin_unlock(&vfsmount_lock); 971 return ret; 972 } 973 974 EXPORT_SYMBOL(may_umount); 975 976 void release_mounts(struct list_head *head) 977 { 978 struct vfsmount *mnt; 979 while (!list_empty(head)) { 980 mnt = list_first_entry(head, struct vfsmount, mnt_hash); 981 list_del_init(&mnt->mnt_hash); 982 if (mnt->mnt_parent != mnt) { 983 struct dentry *dentry; 984 struct vfsmount *m; 985 spin_lock(&vfsmount_lock); 986 dentry = mnt->mnt_mountpoint; 987 m = mnt->mnt_parent; 988 mnt->mnt_mountpoint = mnt->mnt_root; 989 mnt->mnt_parent = mnt; 990 m->mnt_ghosts--; 991 spin_unlock(&vfsmount_lock); 992 dput(dentry); 993 mntput(m); 994 } 995 mntput(mnt); 996 } 997 } 998 999 void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) 1000 { 1001 struct vfsmount *p; 1002 1003 for (p = mnt; p; p = next_mnt(p, mnt)) 1004 list_move(&p->mnt_hash, kill); 1005 1006 if (propagate) 1007 propagate_umount(kill); 1008 1009 list_for_each_entry(p, kill, mnt_hash) { 1010 list_del_init(&p->mnt_expire); 1011 list_del_init(&p->mnt_list); 1012 __touch_mnt_namespace(p->mnt_ns); 1013 p->mnt_ns = NULL; 1014 list_del_init(&p->mnt_child); 1015 if (p->mnt_parent != p) { 1016 p->mnt_parent->mnt_ghosts++; 1017 p->mnt_mountpoint->d_mounted--; 1018 } 1019 change_mnt_propagation(p, MS_PRIVATE); 1020 } 1021 } 1022 1023 static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); 1024 1025 static int do_umount(struct vfsmount *mnt, int flags) 1026 { 1027 struct super_block *sb = mnt->mnt_sb; 1028 int retval; 1029 LIST_HEAD(umount_list); 1030 1031 retval = security_sb_umount(mnt, flags); 1032 if (retval) 1033 return retval; 1034 1035 /* 1036 * Allow userspace to request a mountpoint be expired rather than 1037 * unmounting unconditionally. Unmount only happens if: 1038 * (1) the mark is already set (the mark is cleared by mntput()) 1039 * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] 1040 */ 1041 if (flags & MNT_EXPIRE) { 1042 if (mnt == current->fs->root.mnt || 1043 flags & (MNT_FORCE | MNT_DETACH)) 1044 return -EINVAL; 1045 1046 if (atomic_read(&mnt->mnt_count) != 2) 1047 return -EBUSY; 1048 1049 if (!xchg(&mnt->mnt_expiry_mark, 1)) 1050 return -EAGAIN; 1051 } 1052 1053 /* 1054 * If we may have to abort operations to get out of this 1055 * mount, and they will themselves hold resources we must 1056 * allow the fs to do things. 
In the Unix tradition of 1057 * 'Gee thats tricky lets do it in userspace' the umount_begin 1058 * might fail to complete on the first run through as other tasks 1059 * must return, and the like. Thats for the mount program to worry 1060 * about for the moment. 1061 */ 1062 1063 if (flags & MNT_FORCE && sb->s_op->umount_begin) { 1064 lock_kernel(); 1065 sb->s_op->umount_begin(sb); 1066 unlock_kernel(); 1067 } 1068 1069 /* 1070 * No sense to grab the lock for this test, but test itself looks 1071 * somewhat bogus. Suggestions for better replacement? 1072 * Ho-hum... In principle, we might treat that as umount + switch 1073 * to rootfs. GC would eventually take care of the old vfsmount. 1074 * Actually it makes sense, especially if rootfs would contain a 1075 * /reboot - static binary that would close all descriptors and 1076 * call reboot(9). Then init(8) could umount root and exec /reboot. 1077 */ 1078 if (mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { 1079 /* 1080 * Special case for "unmounting" root ... 1081 * we just try to remount it readonly. 1082 */ 1083 down_write(&sb->s_umount); 1084 if (!(sb->s_flags & MS_RDONLY)) { 1085 lock_kernel(); 1086 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); 1087 unlock_kernel(); 1088 } 1089 up_write(&sb->s_umount); 1090 return retval; 1091 } 1092 1093 down_write(&namespace_sem); 1094 spin_lock(&vfsmount_lock); 1095 event++; 1096 1097 if (!(flags & MNT_DETACH)) 1098 shrink_submounts(mnt, &umount_list); 1099 1100 retval = -EBUSY; 1101 if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) { 1102 if (!list_empty(&mnt->mnt_list)) 1103 umount_tree(mnt, 1, &umount_list); 1104 retval = 0; 1105 } 1106 spin_unlock(&vfsmount_lock); 1107 if (retval) 1108 security_sb_umount_busy(mnt); 1109 up_write(&namespace_sem); 1110 release_mounts(&umount_list); 1111 return retval; 1112 } 1113 1114 /* 1115 * Now umount can handle mount points as well as block devices. 1116 * This is important for filesystems which use unnamed block devices. 1117 * 1118 * We now support a flag for forced unmount like the other 'big iron' 1119 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD 1120 */ 1121 1122 asmlinkage long sys_umount(char __user * name, int flags) 1123 { 1124 struct nameidata nd; 1125 int retval; 1126 1127 retval = __user_walk(name, LOOKUP_FOLLOW, &nd); 1128 if (retval) 1129 goto out; 1130 retval = -EINVAL; 1131 if (nd.path.dentry != nd.path.mnt->mnt_root) 1132 goto dput_and_out; 1133 if (!check_mnt(nd.path.mnt)) 1134 goto dput_and_out; 1135 1136 retval = -EPERM; 1137 if (!capable(CAP_SYS_ADMIN)) 1138 goto dput_and_out; 1139 1140 retval = do_umount(nd.path.mnt, flags); 1141 dput_and_out: 1142 /* we mustn't call path_put() as that would clear mnt_expiry_mark */ 1143 dput(nd.path.dentry); 1144 mntput_no_expire(nd.path.mnt); 1145 out: 1146 return retval; 1147 } 1148 1149 #ifdef __ARCH_WANT_SYS_OLDUMOUNT 1150 1151 /* 1152 * The 2.0 compatible umount. No flags. 
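 *
 * Userspace that wants MNT_FORCE, MNT_DETACH or MNT_EXPIRE has to use
 * umount2(2) instead, which reaches sys_umount() above, e.g.
 * (illustrative):
 *
 *	umount2("/mnt/cdrom", MNT_DETACH);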
1153 */ 1154 asmlinkage long sys_oldumount(char __user * name) 1155 { 1156 return sys_umount(name, 0); 1157 } 1158 1159 #endif 1160 1161 static int mount_is_safe(struct nameidata *nd) 1162 { 1163 if (capable(CAP_SYS_ADMIN)) 1164 return 0; 1165 return -EPERM; 1166 #ifdef notyet 1167 if (S_ISLNK(nd->path.dentry->d_inode->i_mode)) 1168 return -EPERM; 1169 if (nd->path.dentry->d_inode->i_mode & S_ISVTX) { 1170 if (current->uid != nd->path.dentry->d_inode->i_uid) 1171 return -EPERM; 1172 } 1173 if (vfs_permission(nd, MAY_WRITE)) 1174 return -EPERM; 1175 return 0; 1176 #endif 1177 } 1178 1179 struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, 1180 int flag) 1181 { 1182 struct vfsmount *res, *p, *q, *r, *s; 1183 struct path path; 1184 1185 if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) 1186 return NULL; 1187 1188 res = q = clone_mnt(mnt, dentry, flag); 1189 if (!q) 1190 goto Enomem; 1191 q->mnt_mountpoint = mnt->mnt_mountpoint; 1192 1193 p = mnt; 1194 list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { 1195 if (!is_subdir(r->mnt_mountpoint, dentry)) 1196 continue; 1197 1198 for (s = r; s; s = next_mnt(s, r)) { 1199 if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) { 1200 s = skip_mnt_tree(s); 1201 continue; 1202 } 1203 while (p != s->mnt_parent) { 1204 p = p->mnt_parent; 1205 q = q->mnt_parent; 1206 } 1207 p = s; 1208 path.mnt = q; 1209 path.dentry = p->mnt_mountpoint; 1210 q = clone_mnt(p, p->mnt_root, flag); 1211 if (!q) 1212 goto Enomem; 1213 spin_lock(&vfsmount_lock); 1214 list_add_tail(&q->mnt_list, &res->mnt_list); 1215 attach_mnt(q, &path); 1216 spin_unlock(&vfsmount_lock); 1217 } 1218 } 1219 return res; 1220 Enomem: 1221 if (res) { 1222 LIST_HEAD(umount_list); 1223 spin_lock(&vfsmount_lock); 1224 umount_tree(res, 0, &umount_list); 1225 spin_unlock(&vfsmount_lock); 1226 release_mounts(&umount_list); 1227 } 1228 return NULL; 1229 } 1230 1231 struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry) 1232 { 1233 struct vfsmount *tree; 1234 down_write(&namespace_sem); 1235 tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE); 1236 up_write(&namespace_sem); 1237 return tree; 1238 } 1239 1240 void drop_collected_mounts(struct vfsmount *mnt) 1241 { 1242 LIST_HEAD(umount_list); 1243 down_write(&namespace_sem); 1244 spin_lock(&vfsmount_lock); 1245 umount_tree(mnt, 0, &umount_list); 1246 spin_unlock(&vfsmount_lock); 1247 up_write(&namespace_sem); 1248 release_mounts(&umount_list); 1249 } 1250 1251 static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) 1252 { 1253 struct vfsmount *p; 1254 1255 for (p = mnt; p != end; p = next_mnt(p, mnt)) { 1256 if (p->mnt_group_id && !IS_MNT_SHARED(p)) 1257 mnt_release_group_id(p); 1258 } 1259 } 1260 1261 static int invent_group_ids(struct vfsmount *mnt, bool recurse) 1262 { 1263 struct vfsmount *p; 1264 1265 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { 1266 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { 1267 int err = mnt_alloc_group_id(p); 1268 if (err) { 1269 cleanup_group_ids(mnt, p); 1270 return err; 1271 } 1272 } 1273 } 1274 1275 return 0; 1276 } 1277 1278 /* 1279 * @source_mnt : mount tree to be attached 1280 * @nd : place the mount tree @source_mnt is attached 1281 * @parent_nd : if non-null, detach the source_mnt from its parent and 1282 * store the parent mount and mountpoint dentry. 1283 * (done when source_mnt is moved) 1284 * 1285 * NOTE: in the table below explains the semantics when a source mount 1286 * of a given type is attached to a destination mount of a given type. 
1287 * --------------------------------------------------------------------------- 1288 * | BIND MOUNT OPERATION | 1289 * |************************************************************************** 1290 * | source-->| shared | private | slave | unbindable | 1291 * | dest | | | | | 1292 * | | | | | | | 1293 * | v | | | | | 1294 * |************************************************************************** 1295 * | shared | shared (++) | shared (+) | shared(+++)| invalid | 1296 * | | | | | | 1297 * |non-shared| shared (+) | private | slave (*) | invalid | 1298 * *************************************************************************** 1299 * A bind operation clones the source mount and mounts the clone on the 1300 * destination mount. 1301 * 1302 * (++) the cloned mount is propagated to all the mounts in the propagation 1303 * tree of the destination mount and the cloned mount is added to 1304 * the peer group of the source mount. 1305 * (+) the cloned mount is created under the destination mount and is marked 1306 * as shared. The cloned mount is added to the peer group of the source 1307 * mount. 1308 * (+++) the mount is propagated to all the mounts in the propagation tree 1309 * of the destination mount and the cloned mount is made slave 1310 * of the same master as that of the source mount. The cloned mount 1311 * is marked as 'shared and slave'. 1312 * (*) the cloned mount is made a slave of the same master as that of the 1313 * source mount. 1314 * 1315 * --------------------------------------------------------------------------- 1316 * | MOVE MOUNT OPERATION | 1317 * |************************************************************************** 1318 * | source-->| shared | private | slave | unbindable | 1319 * | dest | | | | | 1320 * | | | | | | | 1321 * | v | | | | | 1322 * |************************************************************************** 1323 * | shared | shared (+) | shared (+) | shared(+++) | invalid | 1324 * | | | | | | 1325 * |non-shared| shared (+*) | private | slave (*) | unbindable | 1326 * *************************************************************************** 1327 * 1328 * (+) the mount is moved to the destination. And is then propagated to 1329 * all the mounts in the propagation tree of the destination mount. 1330 * (+*) the mount is moved to the destination. 1331 * (+++) the mount is moved to the destination and is then propagated to 1332 * all the mounts belonging to the destination mount's propagation tree. 1333 * the mount is marked as 'shared and slave'. 1334 * (*) the mount continues to be a slave at the new location. 1335 * 1336 * if the source mount is a tree, the operations explained above is 1337 * applied to each mount in the tree. 1338 * Must be called without spinlocks held, since this function can sleep 1339 * in allocations. 
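 *
 * Worked example of the bind table above (illustrative): bind-mounting a
 * shared source onto a shared destination is the "shared/shared (++)"
 * case: the clone is mounted on the destination, copies of it are
 * propagated to every mount in the destination's propagation tree, and
 * the clone joins the source's peer group.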
1340 */ 1341 static int attach_recursive_mnt(struct vfsmount *source_mnt, 1342 struct path *path, struct path *parent_path) 1343 { 1344 LIST_HEAD(tree_list); 1345 struct vfsmount *dest_mnt = path->mnt; 1346 struct dentry *dest_dentry = path->dentry; 1347 struct vfsmount *child, *p; 1348 int err; 1349 1350 if (IS_MNT_SHARED(dest_mnt)) { 1351 err = invent_group_ids(source_mnt, true); 1352 if (err) 1353 goto out; 1354 } 1355 err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list); 1356 if (err) 1357 goto out_cleanup_ids; 1358 1359 if (IS_MNT_SHARED(dest_mnt)) { 1360 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 1361 set_mnt_shared(p); 1362 } 1363 1364 spin_lock(&vfsmount_lock); 1365 if (parent_path) { 1366 detach_mnt(source_mnt, parent_path); 1367 attach_mnt(source_mnt, path); 1368 touch_mnt_namespace(current->nsproxy->mnt_ns); 1369 } else { 1370 mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); 1371 commit_tree(source_mnt); 1372 } 1373 1374 list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { 1375 list_del_init(&child->mnt_hash); 1376 commit_tree(child); 1377 } 1378 spin_unlock(&vfsmount_lock); 1379 return 0; 1380 1381 out_cleanup_ids: 1382 if (IS_MNT_SHARED(dest_mnt)) 1383 cleanup_group_ids(source_mnt, NULL); 1384 out: 1385 return err; 1386 } 1387 1388 static int graft_tree(struct vfsmount *mnt, struct path *path) 1389 { 1390 int err; 1391 if (mnt->mnt_sb->s_flags & MS_NOUSER) 1392 return -EINVAL; 1393 1394 if (S_ISDIR(path->dentry->d_inode->i_mode) != 1395 S_ISDIR(mnt->mnt_root->d_inode->i_mode)) 1396 return -ENOTDIR; 1397 1398 err = -ENOENT; 1399 mutex_lock(&path->dentry->d_inode->i_mutex); 1400 if (IS_DEADDIR(path->dentry->d_inode)) 1401 goto out_unlock; 1402 1403 err = security_sb_check_sb(mnt, path); 1404 if (err) 1405 goto out_unlock; 1406 1407 err = -ENOENT; 1408 if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry)) 1409 err = attach_recursive_mnt(mnt, path, NULL); 1410 out_unlock: 1411 mutex_unlock(&path->dentry->d_inode->i_mutex); 1412 if (!err) 1413 security_sb_post_addmount(mnt, path); 1414 return err; 1415 } 1416 1417 /* 1418 * recursively change the type of the mountpoint. 1419 * noinline this do_mount helper to save do_mount stack space. 1420 */ 1421 static noinline int do_change_type(struct nameidata *nd, int flag) 1422 { 1423 struct vfsmount *m, *mnt = nd->path.mnt; 1424 int recurse = flag & MS_REC; 1425 int type = flag & ~MS_REC; 1426 int err = 0; 1427 1428 if (!capable(CAP_SYS_ADMIN)) 1429 return -EPERM; 1430 1431 if (nd->path.dentry != nd->path.mnt->mnt_root) 1432 return -EINVAL; 1433 1434 down_write(&namespace_sem); 1435 if (type == MS_SHARED) { 1436 err = invent_group_ids(mnt, recurse); 1437 if (err) 1438 goto out_unlock; 1439 } 1440 1441 spin_lock(&vfsmount_lock); 1442 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) 1443 change_mnt_propagation(m, type); 1444 spin_unlock(&vfsmount_lock); 1445 1446 out_unlock: 1447 up_write(&namespace_sem); 1448 return err; 1449 } 1450 1451 /* 1452 * do loopback mount. 1453 * noinline this do_mount helper to save do_mount stack space. 
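 *
 * Reached from do_mount() below for MS_BIND; @recurse is MS_REC, i.e.
 * (illustrative, paths made up):
 *
 *	mount("/src", "/dst", NULL, MS_BIND, NULL);		bind one mount
 *	mount("/src", "/dst", NULL, MS_BIND | MS_REC, NULL);	bind the whole tree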
1454 */ 1455 static noinline int do_loopback(struct nameidata *nd, char *old_name, 1456 int recurse) 1457 { 1458 struct nameidata old_nd; 1459 struct vfsmount *mnt = NULL; 1460 int err = mount_is_safe(nd); 1461 if (err) 1462 return err; 1463 if (!old_name || !*old_name) 1464 return -EINVAL; 1465 err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); 1466 if (err) 1467 return err; 1468 1469 down_write(&namespace_sem); 1470 err = -EINVAL; 1471 if (IS_MNT_UNBINDABLE(old_nd.path.mnt)) 1472 goto out; 1473 1474 if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt)) 1475 goto out; 1476 1477 err = -ENOMEM; 1478 if (recurse) 1479 mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, 0); 1480 else 1481 mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, 0); 1482 1483 if (!mnt) 1484 goto out; 1485 1486 err = graft_tree(mnt, &nd->path); 1487 if (err) { 1488 LIST_HEAD(umount_list); 1489 spin_lock(&vfsmount_lock); 1490 umount_tree(mnt, 0, &umount_list); 1491 spin_unlock(&vfsmount_lock); 1492 release_mounts(&umount_list); 1493 } 1494 1495 out: 1496 up_write(&namespace_sem); 1497 path_put(&old_nd.path); 1498 return err; 1499 } 1500 1501 static int change_mount_flags(struct vfsmount *mnt, int ms_flags) 1502 { 1503 int error = 0; 1504 int readonly_request = 0; 1505 1506 if (ms_flags & MS_RDONLY) 1507 readonly_request = 1; 1508 if (readonly_request == __mnt_is_readonly(mnt)) 1509 return 0; 1510 1511 if (readonly_request) 1512 error = mnt_make_readonly(mnt); 1513 else 1514 __mnt_unmake_readonly(mnt); 1515 return error; 1516 } 1517 1518 /* 1519 * change filesystem flags. dir should be a physical root of filesystem. 1520 * If you've mounted a non-root directory somewhere and want to do remount 1521 * on it - tough luck. 1522 * noinline this do_mount helper to save do_mount stack space. 1523 */ 1524 static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, 1525 void *data) 1526 { 1527 int err; 1528 struct super_block *sb = nd->path.mnt->mnt_sb; 1529 1530 if (!capable(CAP_SYS_ADMIN)) 1531 return -EPERM; 1532 1533 if (!check_mnt(nd->path.mnt)) 1534 return -EINVAL; 1535 1536 if (nd->path.dentry != nd->path.mnt->mnt_root) 1537 return -EINVAL; 1538 1539 down_write(&sb->s_umount); 1540 if (flags & MS_BIND) 1541 err = change_mount_flags(nd->path.mnt, flags); 1542 else 1543 err = do_remount_sb(sb, flags, data, 0); 1544 if (!err) 1545 nd->path.mnt->mnt_flags = mnt_flags; 1546 up_write(&sb->s_umount); 1547 if (!err) 1548 security_sb_post_remount(nd->path.mnt, flags, data); 1549 return err; 1550 } 1551 1552 static inline int tree_contains_unbindable(struct vfsmount *mnt) 1553 { 1554 struct vfsmount *p; 1555 for (p = mnt; p; p = next_mnt(p, mnt)) { 1556 if (IS_MNT_UNBINDABLE(p)) 1557 return 1; 1558 } 1559 return 0; 1560 } 1561 1562 /* 1563 * noinline this do_mount helper to save do_mount stack space. 
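 *
 * Reached from do_mount() below for MS_MOVE, i.e. (illustrative, paths
 * made up):
 *
 *	mount("/old/mountpoint", "/new/mountpoint", NULL, MS_MOVE, NULL);
 *
 * which detaches the mount from its old parent and attaches it at the
 * new location via attach_recursive_mnt().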
1564 */ 1565 static noinline int do_move_mount(struct nameidata *nd, char *old_name) 1566 { 1567 struct nameidata old_nd; 1568 struct path parent_path; 1569 struct vfsmount *p; 1570 int err = 0; 1571 if (!capable(CAP_SYS_ADMIN)) 1572 return -EPERM; 1573 if (!old_name || !*old_name) 1574 return -EINVAL; 1575 err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); 1576 if (err) 1577 return err; 1578 1579 down_write(&namespace_sem); 1580 while (d_mountpoint(nd->path.dentry) && 1581 follow_down(&nd->path.mnt, &nd->path.dentry)) 1582 ; 1583 err = -EINVAL; 1584 if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt)) 1585 goto out; 1586 1587 err = -ENOENT; 1588 mutex_lock(&nd->path.dentry->d_inode->i_mutex); 1589 if (IS_DEADDIR(nd->path.dentry->d_inode)) 1590 goto out1; 1591 1592 if (!IS_ROOT(nd->path.dentry) && d_unhashed(nd->path.dentry)) 1593 goto out1; 1594 1595 err = -EINVAL; 1596 if (old_nd.path.dentry != old_nd.path.mnt->mnt_root) 1597 goto out1; 1598 1599 if (old_nd.path.mnt == old_nd.path.mnt->mnt_parent) 1600 goto out1; 1601 1602 if (S_ISDIR(nd->path.dentry->d_inode->i_mode) != 1603 S_ISDIR(old_nd.path.dentry->d_inode->i_mode)) 1604 goto out1; 1605 /* 1606 * Don't move a mount residing in a shared parent. 1607 */ 1608 if (old_nd.path.mnt->mnt_parent && 1609 IS_MNT_SHARED(old_nd.path.mnt->mnt_parent)) 1610 goto out1; 1611 /* 1612 * Don't move a mount tree containing unbindable mounts to a destination 1613 * mount which is shared. 1614 */ 1615 if (IS_MNT_SHARED(nd->path.mnt) && 1616 tree_contains_unbindable(old_nd.path.mnt)) 1617 goto out1; 1618 err = -ELOOP; 1619 for (p = nd->path.mnt; p->mnt_parent != p; p = p->mnt_parent) 1620 if (p == old_nd.path.mnt) 1621 goto out1; 1622 1623 err = attach_recursive_mnt(old_nd.path.mnt, &nd->path, &parent_path); 1624 if (err) 1625 goto out1; 1626 1627 /* if the mount is moved, it should no longer be expire 1628 * automatically */ 1629 list_del_init(&old_nd.path.mnt->mnt_expire); 1630 out1: 1631 mutex_unlock(&nd->path.dentry->d_inode->i_mutex); 1632 out: 1633 up_write(&namespace_sem); 1634 if (!err) 1635 path_put(&parent_path); 1636 path_put(&old_nd.path); 1637 return err; 1638 } 1639 1640 /* 1641 * create a new mount for userspace and request it to be added into the 1642 * namespace's tree 1643 * noinline this do_mount helper to save do_mount stack space. 1644 */ 1645 static noinline int do_new_mount(struct nameidata *nd, char *type, int flags, 1646 int mnt_flags, char *name, void *data) 1647 { 1648 struct vfsmount *mnt; 1649 1650 if (!type || !memchr(type, 0, PAGE_SIZE)) 1651 return -EINVAL; 1652 1653 /* we need capabilities... 
*/ 1654 if (!capable(CAP_SYS_ADMIN)) 1655 return -EPERM; 1656 1657 mnt = do_kern_mount(type, flags, name, data); 1658 if (IS_ERR(mnt)) 1659 return PTR_ERR(mnt); 1660 1661 return do_add_mount(mnt, nd, mnt_flags, NULL); 1662 } 1663 1664 /* 1665 * add a mount into a namespace's mount tree 1666 * - provide the option of adding the new mount to an expiration list 1667 */ 1668 int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, 1669 int mnt_flags, struct list_head *fslist) 1670 { 1671 int err; 1672 1673 down_write(&namespace_sem); 1674 /* Something was mounted here while we slept */ 1675 while (d_mountpoint(nd->path.dentry) && 1676 follow_down(&nd->path.mnt, &nd->path.dentry)) 1677 ; 1678 err = -EINVAL; 1679 if (!check_mnt(nd->path.mnt)) 1680 goto unlock; 1681 1682 /* Refuse the same filesystem on the same mount point */ 1683 err = -EBUSY; 1684 if (nd->path.mnt->mnt_sb == newmnt->mnt_sb && 1685 nd->path.mnt->mnt_root == nd->path.dentry) 1686 goto unlock; 1687 1688 err = -EINVAL; 1689 if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) 1690 goto unlock; 1691 1692 newmnt->mnt_flags = mnt_flags; 1693 if ((err = graft_tree(newmnt, &nd->path))) 1694 goto unlock; 1695 1696 if (fslist) /* add to the specified expiration list */ 1697 list_add_tail(&newmnt->mnt_expire, fslist); 1698 1699 up_write(&namespace_sem); 1700 return 0; 1701 1702 unlock: 1703 up_write(&namespace_sem); 1704 mntput(newmnt); 1705 return err; 1706 } 1707 1708 EXPORT_SYMBOL_GPL(do_add_mount); 1709 1710 /* 1711 * process a list of expirable mountpoints with the intent of discarding any 1712 * mountpoints that aren't in use and haven't been touched since last we came 1713 * here 1714 */ 1715 void mark_mounts_for_expiry(struct list_head *mounts) 1716 { 1717 struct vfsmount *mnt, *next; 1718 LIST_HEAD(graveyard); 1719 LIST_HEAD(umounts); 1720 1721 if (list_empty(mounts)) 1722 return; 1723 1724 down_write(&namespace_sem); 1725 spin_lock(&vfsmount_lock); 1726 1727 /* extract from the expiration list every vfsmount that matches the 1728 * following criteria: 1729 * - only referenced by its parent vfsmount 1730 * - still marked for expiry (marked on the last call here; marks are 1731 * cleared by mntput()) 1732 */ 1733 list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { 1734 if (!xchg(&mnt->mnt_expiry_mark, 1) || 1735 propagate_mount_busy(mnt, 1)) 1736 continue; 1737 list_move(&mnt->mnt_expire, &graveyard); 1738 } 1739 while (!list_empty(&graveyard)) { 1740 mnt = list_first_entry(&graveyard, struct vfsmount, mnt_expire); 1741 touch_mnt_namespace(mnt->mnt_ns); 1742 umount_tree(mnt, 1, &umounts); 1743 } 1744 spin_unlock(&vfsmount_lock); 1745 up_write(&namespace_sem); 1746 1747 release_mounts(&umounts); 1748 } 1749 1750 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); 1751 1752 /* 1753 * Ripoff of 'select_parent()' 1754 * 1755 * search the list of submounts for a given mountpoint, and move any 1756 * shrinkable submounts to the 'graveyard' list. 1757 */ 1758 static int select_submounts(struct vfsmount *parent, struct list_head *graveyard) 1759 { 1760 struct vfsmount *this_parent = parent; 1761 struct list_head *next; 1762 int found = 0; 1763 1764 repeat: 1765 next = this_parent->mnt_mounts.next; 1766 resume: 1767 while (next != &this_parent->mnt_mounts) { 1768 struct list_head *tmp = next; 1769 struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child); 1770 1771 next = tmp->next; 1772 if (!(mnt->mnt_flags & MNT_SHRINKABLE)) 1773 continue; 1774 /* 1775 * Descend a level if the d_mounts list is non-empty. 
		 */
		if (!list_empty(&mnt->mnt_mounts)) {
			this_parent = mnt;
			goto repeat;
		}

		if (!propagate_mount_busy(mnt, 1)) {
			list_move_tail(&mnt->mnt_expire, graveyard);
			found++;
		}
	}
	/*
	 * All done at this level ... ascend and resume the search
	 */
	if (this_parent != parent) {
		next = this_parent->mnt_child.next;
		this_parent = this_parent->mnt_parent;
		goto resume;
	}
	return found;
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 */
static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
{
	LIST_HEAD(graveyard);
	struct vfsmount *m;

	/* extract submounts of 'mountpoint' from the expiration list */
	while (select_submounts(mnt, &graveyard)) {
		while (!list_empty(&graveyard)) {
			m = list_first_entry(&graveyard, struct vfsmount,
						mnt_expire);
			touch_mnt_namespace(mnt->mnt_ns);
			umount_tree(m, 1, umounts);
		}
	}
}

/*
 * Some copy_from_user() implementations do not return the exact number of
 * bytes remaining to copy on a fault.  But copy_mount_options() requires that.
 * Note that this function differs from copy_from_user() in that it will oops
 * on bad values of `to', rather than returning a short copy.
 */
static long exact_copy_from_user(void *to, const void __user * from,
				 unsigned long n)
{
	char *t = to;
	const char __user *f = from;
	char c;

	if (!access_ok(VERIFY_READ, from, n))
		return n;

	while (n) {
		if (__get_user(c, f)) {
			memset(t, 0, n);
			break;
		}
		*t++ = c;
		f++;
		n--;
	}
	return n;
}

int copy_mount_options(const void __user * data, unsigned long *where)
{
	int i;
	unsigned long page;
	unsigned long size;

	*where = 0;
	if (!data)
		return 0;

	if (!(page = __get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	/* We only care that *some* data at the address the user
	 * gave us is valid.  Just in case, we'll zero
	 * the remainder of the page.
	 */
	/* copy_from_user cannot cross TASK_SIZE ! */
	size = TASK_SIZE - (unsigned long)data;
	if (size > PAGE_SIZE)
		size = PAGE_SIZE;

	i = size - exact_copy_from_user((void *)page, data, size);
	if (!i) {
		free_page(page);
		return -EFAULT;
	}
	if (i != PAGE_SIZE)
		memset((char *)page + i, 0, PAGE_SIZE - i);
	*where = page;
	return 0;
}

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
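 *
 * Illustrative walk through the code below: a caller passing
 * flags == 0xC0ED0000 | MS_RDONLY | MS_NOSUID first has the MS_MGC_VAL
 * magic stripped; MS_NOSUID then becomes the per-mountpoint MNT_NOSUID
 * bit and is removed from the flags handed to the filesystem, while
 * MS_RDONLY sets MNT_READONLY but is also still passed down to the
 * superblock.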
1892 */ 1893 long do_mount(char *dev_name, char *dir_name, char *type_page, 1894 unsigned long flags, void *data_page) 1895 { 1896 struct nameidata nd; 1897 int retval = 0; 1898 int mnt_flags = 0; 1899 1900 /* Discard magic */ 1901 if ((flags & MS_MGC_MSK) == MS_MGC_VAL) 1902 flags &= ~MS_MGC_MSK; 1903 1904 /* Basic sanity checks */ 1905 1906 if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) 1907 return -EINVAL; 1908 if (dev_name && !memchr(dev_name, 0, PAGE_SIZE)) 1909 return -EINVAL; 1910 1911 if (data_page) 1912 ((char *)data_page)[PAGE_SIZE - 1] = 0; 1913 1914 /* Separate the per-mountpoint flags */ 1915 if (flags & MS_NOSUID) 1916 mnt_flags |= MNT_NOSUID; 1917 if (flags & MS_NODEV) 1918 mnt_flags |= MNT_NODEV; 1919 if (flags & MS_NOEXEC) 1920 mnt_flags |= MNT_NOEXEC; 1921 if (flags & MS_NOATIME) 1922 mnt_flags |= MNT_NOATIME; 1923 if (flags & MS_NODIRATIME) 1924 mnt_flags |= MNT_NODIRATIME; 1925 if (flags & MS_RELATIME) 1926 mnt_flags |= MNT_RELATIME; 1927 if (flags & MS_RDONLY) 1928 mnt_flags |= MNT_READONLY; 1929 1930 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | 1931 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); 1932 1933 /* ... and get the mountpoint */ 1934 retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd); 1935 if (retval) 1936 return retval; 1937 1938 retval = security_sb_mount(dev_name, &nd.path, 1939 type_page, flags, data_page); 1940 if (retval) 1941 goto dput_out; 1942 1943 if (flags & MS_REMOUNT) 1944 retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, 1945 data_page); 1946 else if (flags & MS_BIND) 1947 retval = do_loopback(&nd, dev_name, flags & MS_REC); 1948 else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 1949 retval = do_change_type(&nd, flags); 1950 else if (flags & MS_MOVE) 1951 retval = do_move_mount(&nd, dev_name); 1952 else 1953 retval = do_new_mount(&nd, type_page, flags, mnt_flags, 1954 dev_name, data_page); 1955 dput_out: 1956 path_put(&nd.path); 1957 return retval; 1958 } 1959 1960 /* 1961 * Allocate a new namespace structure and populate it with contents 1962 * copied from the namespace of the passed in task structure. 1963 */ 1964 static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, 1965 struct fs_struct *fs) 1966 { 1967 struct mnt_namespace *new_ns; 1968 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; 1969 struct vfsmount *p, *q; 1970 1971 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); 1972 if (!new_ns) 1973 return ERR_PTR(-ENOMEM); 1974 1975 atomic_set(&new_ns->count, 1); 1976 INIT_LIST_HEAD(&new_ns->list); 1977 init_waitqueue_head(&new_ns->poll); 1978 new_ns->event = 0; 1979 1980 down_write(&namespace_sem); 1981 /* First pass: copy the tree topology */ 1982 new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root, 1983 CL_COPY_ALL | CL_EXPIRE); 1984 if (!new_ns->root) { 1985 up_write(&namespace_sem); 1986 kfree(new_ns); 1987 return ERR_PTR(-ENOMEM);; 1988 } 1989 spin_lock(&vfsmount_lock); 1990 list_add_tail(&new_ns->list, &new_ns->root->mnt_list); 1991 spin_unlock(&vfsmount_lock); 1992 1993 /* 1994 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts 1995 * as belonging to new namespace. We have already acquired a private 1996 * fs_struct, so tsk->fs->lock is not needed. 
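 *
 * The two trees are walked in lockstep below: p and q both start at the
 * respective roots and both advance via next_mnt(), so whenever p is the
 * vfsmount that fs->root, fs->pwd or fs->altroot points at, q is the
 * clone that should replace it.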
/*
 * Allocate a new namespace structure and populate it with contents
 * copied from the namespace of the passed in task structure.
 */
static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
		struct fs_struct *fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
	struct vfsmount *p, *q;

	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
	if (!new_ns)
		return ERR_PTR(-ENOMEM);

	atomic_set(&new_ns->count, 1);
	INIT_LIST_HEAD(&new_ns->list);
	init_waitqueue_head(&new_ns->poll);
	new_ns->event = 0;

	down_write(&namespace_sem);
	/* First pass: copy the tree topology */
	new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root,
					CL_COPY_ALL | CL_EXPIRE);
	if (!new_ns->root) {
		up_write(&namespace_sem);
		kfree(new_ns);
		return ERR_PTR(-ENOMEM);
	}
	spin_lock(&vfsmount_lock);
	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
	spin_unlock(&vfsmount_lock);

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace.  We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
	 */
	p = mnt_ns->root;
	q = new_ns->root;
	while (p) {
		q->mnt_ns = new_ns;
		if (fs) {
			if (p == fs->root.mnt) {
				rootmnt = p;
				fs->root.mnt = mntget(q);
			}
			if (p == fs->pwd.mnt) {
				pwdmnt = p;
				fs->pwd.mnt = mntget(q);
			}
			if (p == fs->altroot.mnt) {
				altrootmnt = p;
				fs->altroot.mnt = mntget(q);
			}
		}
		p = next_mnt(p, mnt_ns->root);
		q = next_mnt(q, new_ns->root);
	}
	up_write(&namespace_sem);

	if (rootmnt)
		mntput(rootmnt);
	if (pwdmnt)
		mntput(pwdmnt);
	if (altrootmnt)
		mntput(altrootmnt);

	return new_ns;
}

struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
		struct fs_struct *new_fs)
{
	struct mnt_namespace *new_ns;

	BUG_ON(!ns);
	get_mnt_ns(ns);

	if (!(flags & CLONE_NEWNS))
		return ns;

	new_ns = dup_mnt_ns(ns, new_fs);

	put_mnt_ns(ns);
	return new_ns;
}

asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
			  char __user * type, unsigned long flags,
			  void __user * data)
{
	int retval;
	unsigned long data_page;
	unsigned long type_page;
	unsigned long dev_page;
	char *dir_page;

	retval = copy_mount_options(type, &type_page);
	if (retval < 0)
		return retval;

	dir_page = getname(dir_name);
	retval = PTR_ERR(dir_page);
	if (IS_ERR(dir_page))
		goto out1;

	retval = copy_mount_options(dev_name, &dev_page);
	if (retval < 0)
		goto out2;

	retval = copy_mount_options(data, &data_page);
	if (retval < 0)
		goto out3;

	lock_kernel();
	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
			  flags, (void *)data_page);
	unlock_kernel();
	free_page(data_page);

out3:
	free_page(dev_page);
out2:
	putname(dir_page);
out1:
	free_page(type_page);
	return retval;
}
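/*
 * Example (user space, illustrative only, not part of this file): a task
 * can request a private copy of its mount namespace with CLONE_NEWNS, at
 * which point copy_mnt_ns()/dup_mnt_ns() above duplicate the whole mount
 * tree.  Mounts made afterwards stay local to the new namespace (the mount
 * point below is a made-up path).  Requires CAP_SYS_ADMIN.
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <sys/mount.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		if (unshare(CLONE_NEWNS) == -1) {
 *			perror("unshare");
 *			return 1;
 *		}
 *		// private to this namespace; the parent namespace is untouched
 *		if (mount("none", "/mnt/private", "tmpfs", 0, NULL) == -1) {
 *			perror("mount");
 *			return 1;
 *		}
 *		return 0;
 *	}
 */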
/*
 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
 * It can block. Requires the big lock held.
 */
void set_fs_root(struct fs_struct *fs, struct path *path)
{
	struct path old_root;

	write_lock(&fs->lock);
	old_root = fs->root;
	fs->root = *path;
	path_get(path);
	write_unlock(&fs->lock);
	if (old_root.dentry)
		path_put(&old_root);
}

/*
 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
 * It can block. Requires the big lock held.
 */
void set_fs_pwd(struct fs_struct *fs, struct path *path)
{
	struct path old_pwd;

	write_lock(&fs->lock);
	old_pwd = fs->pwd;
	fs->pwd = *path;
	path_get(path);
	write_unlock(&fs->lock);

	if (old_pwd.dentry)
		path_put(&old_pwd);
}

static void chroot_fs_refs(struct path *old_root, struct path *new_root)
{
	struct task_struct *g, *p;
	struct fs_struct *fs;

	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		task_lock(p);
		fs = p->fs;
		if (fs) {
			atomic_inc(&fs->count);
			task_unlock(p);
			if (fs->root.dentry == old_root->dentry
			    && fs->root.mnt == old_root->mnt)
				set_fs_root(fs, new_root);
			if (fs->pwd.dentry == old_root->dentry
			    && fs->pwd.mnt == old_root->mnt)
				set_fs_pwd(fs, new_root);
			put_fs_struct(fs);
		} else
			task_unlock(p);
	} while_each_thread(g, p);
	read_unlock(&tasklist_lock);
}
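/*
 * Example (user space, illustrative only, not part of this file):
 * chroot(2) ends up in set_fs_root() above but deliberately leaves
 * fs->pwd alone, which is why the traditional idiom follows it with
 * chdir("/").  The jail path is a made-up value for the example.
 *
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		if (chroot("/srv/jail") == -1 || chdir("/") == -1) {
 *			perror("chroot");
 *			return 1;
 *		}
 *		return 0;
 *	}
 */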
/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root as the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
asmlinkage long sys_pivot_root(const char __user * new_root,
			       const char __user * put_old)
{
	struct vfsmount *tmp;
	struct nameidata new_nd, old_nd;
	struct path parent_path, root_parent, root;
	int error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
			    &new_nd);
	if (error)
		goto out0;
	error = -EINVAL;
	if (!check_mnt(new_nd.path.mnt))
		goto out1;

	error = __user_walk(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_nd);
	if (error)
		goto out1;

	error = security_sb_pivotroot(&old_nd.path, &new_nd.path);
	if (error) {
		path_put(&old_nd.path);
		goto out1;
	}

	read_lock(&current->fs->lock);
	root = current->fs->root;
	path_get(&current->fs->root);
	read_unlock(&current->fs->lock);
	down_write(&namespace_sem);
	mutex_lock(&old_nd.path.dentry->d_inode->i_mutex);
	error = -EINVAL;
	if (IS_MNT_SHARED(old_nd.path.mnt) ||
		IS_MNT_SHARED(new_nd.path.mnt->mnt_parent) ||
		IS_MNT_SHARED(root.mnt->mnt_parent))
		goto out2;
	if (!check_mnt(root.mnt))
		goto out2;
	error = -ENOENT;
	if (IS_DEADDIR(new_nd.path.dentry->d_inode))
		goto out2;
	if (d_unhashed(new_nd.path.dentry) && !IS_ROOT(new_nd.path.dentry))
		goto out2;
	if (d_unhashed(old_nd.path.dentry) && !IS_ROOT(old_nd.path.dentry))
		goto out2;
	error = -EBUSY;
	if (new_nd.path.mnt == root.mnt ||
	    old_nd.path.mnt == root.mnt)
		goto out2; /* loop, on the same file system */
	error = -EINVAL;
	if (root.mnt->mnt_root != root.dentry)
		goto out2; /* not a mountpoint */
	if (root.mnt->mnt_parent == root.mnt)
		goto out2; /* not attached */
	if (new_nd.path.mnt->mnt_root != new_nd.path.dentry)
		goto out2; /* not a mountpoint */
	if (new_nd.path.mnt->mnt_parent == new_nd.path.mnt)
		goto out2; /* not attached */
	/* make sure we can reach put_old from new_root */
	tmp = old_nd.path.mnt;
	spin_lock(&vfsmount_lock);
	if (tmp != new_nd.path.mnt) {
		for (;;) {
			if (tmp->mnt_parent == tmp)
				goto out3; /* already mounted on put_old */
			if (tmp->mnt_parent == new_nd.path.mnt)
				break;
			tmp = tmp->mnt_parent;
		}
		if (!is_subdir(tmp->mnt_mountpoint, new_nd.path.dentry))
			goto out3;
	} else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry))
		goto out3;
	detach_mnt(new_nd.path.mnt, &parent_path);
	detach_mnt(root.mnt, &root_parent);
	/* mount old root on put_old */
	attach_mnt(root.mnt, &old_nd.path);
	/* mount new_root on / */
	attach_mnt(new_nd.path.mnt, &root_parent);
	touch_mnt_namespace(current->nsproxy->mnt_ns);
	spin_unlock(&vfsmount_lock);
	chroot_fs_refs(&root, &new_nd.path);
	security_sb_post_pivotroot(&root, &new_nd.path);
	error = 0;
	path_put(&root_parent);
	path_put(&parent_path);
out2:
	mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex);
	up_write(&namespace_sem);
	path_put(&root);
	path_put(&old_nd.path);
out1:
	path_put(&new_nd.path);
out0:
	return error;
out3:
	spin_unlock(&vfsmount_lock);
	goto out2;
}
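/*
 * Example (user space, illustrative only, not part of this file): the
 * classic way an early-boot environment hands over to the real root,
 * exercising sys_pivot_root() above.  glibc of this era has no wrapper,
 * so syscall(2) is used; all paths are made up for the example and
 * new_root must already be a mount point, as the Notes above explain.
 *
 *	#define _GNU_SOURCE
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <sys/mount.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		if (chdir("/newroot") == -1)
 *			return 1;
 *		// put_old ("/newroot/oldroot") must exist and lie under new_root
 *		if (syscall(SYS_pivot_root, ".", "oldroot") == -1) {
 *			perror("pivot_root");
 *			return 1;
 *		}
 *		chdir("/");
 *		// the old root is now reachable at /oldroot and can be detached
 *		umount2("/oldroot", MNT_DETACH);
 *		return 0;
 *	}
 */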
static void __init init_mount_tree(void)
{
	struct vfsmount *mnt;
	struct mnt_namespace *ns;
	struct path root;

	mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
	if (IS_ERR(mnt))
		panic("Can't create rootfs");
	ns = kmalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		panic("Can't allocate initial namespace");
	atomic_set(&ns->count, 1);
	INIT_LIST_HEAD(&ns->list);
	init_waitqueue_head(&ns->poll);
	ns->event = 0;
	list_add(&mnt->mnt_list, &ns->list);
	ns->root = mnt;
	mnt->mnt_ns = ns;

	init_task.nsproxy->mnt_ns = ns;
	get_mnt_ns(ns);

	root.mnt = ns->root;
	root.dentry = ns->root->mnt_root;

	set_fs_pwd(current->fs, &root);
	set_fs_root(current->fs, &root);
}

void __init mnt_init(void)
{
	unsigned u;
	int err;

	init_rwsem(&namespace_sem);

	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);

	if (!mount_hashtable)
		panic("Failed to allocate mount hash table\n");

	printk("Mount-cache hash table entries: %lu\n", HASH_SIZE);

	for (u = 0; u < HASH_SIZE; u++)
		INIT_LIST_HEAD(&mount_hashtable[u]);

	err = sysfs_init();
	if (err)
		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
			__func__, err);
	fs_kobj = kobject_create_and_add("fs", NULL);
	if (!fs_kobj)
		printk(KERN_WARNING "%s: kobj create error\n", __func__);
	init_rootfs();
	init_mount_tree();
}

void __put_mnt_ns(struct mnt_namespace *ns)
{
	struct vfsmount *root = ns->root;
	LIST_HEAD(umount_list);
	ns->root = NULL;
	spin_unlock(&vfsmount_lock);
	down_write(&namespace_sem);
	spin_lock(&vfsmount_lock);
	umount_tree(root, 0, &umount_list);
	spin_unlock(&vfsmount_lock);
	up_write(&namespace_sem);
	release_mounts(&umount_list);
	kfree(ns);
}
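/*
 * Note on the locking above: __put_mnt_ns() starts with spin_unlock()
 * because its caller drops the final reference while already holding
 * vfsmount_lock.  That caller, put_mnt_ns() in include/linux/mnt_namespace.h,
 * looks roughly like the sketch below (reproduced from memory for
 * illustration, not part of this file):
 *
 *	static inline void put_mnt_ns(struct mnt_namespace *ns)
 *	{
 *		if (atomic_dec_and_lock(&ns->count, &vfsmount_lock))
 *			// takes vfsmount_lock on the final put;
 *			// __put_mnt_ns() releases it
 *			__put_mnt_ns(ns);
 *	}
 */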