/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *	Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/idr.h>
#include <linux/acct.h>		/* acct_auto_close_mnt */
#include <linux/ramfs.h>	/* init_rootfs */
#include <linux/fs_struct.h>	/* get_fs_root et al. */
#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
#include <linux/uaccess.h>
#include "pnode.h"
#include "internal.h"

#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
#define HASH_SIZE (1UL << HASH_SHIFT)

static int event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
static DEFINE_SPINLOCK(mnt_id_lock);
static int mnt_id_start = 0;
static int mnt_group_start = 1;

static struct list_head *mount_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static struct rw_semaphore namespace_sem;

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, ie. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
DEFINE_BRLOCK(vfsmount_lock);

static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> HASH_SHIFT);
	return tmp & (HASH_SIZE - 1);
}

#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)

/*
 * allocation is serialized by namespace_sem, but we need the spinlock to
 * serialize with freeing.
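 *
 * Note: ida_get_new_above() below can still return -EAGAIN even after
 * ida_pre_get(), in which case the caller simply retries the whole sequence.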
 */
static int mnt_alloc_id(struct mount *mnt)
{
	int res;

retry:
	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
	spin_lock(&mnt_id_lock);
	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
	if (!res)
		mnt_id_start = mnt->mnt_id + 1;
	spin_unlock(&mnt_id_lock);
	if (res == -EAGAIN)
		goto retry;

	return res;
}

static void mnt_free_id(struct mount *mnt)
{
	int id = mnt->mnt_id;
	spin_lock(&mnt_id_lock);
	ida_remove(&mnt_id_ida, id);
	if (mnt_id_start > id)
		mnt_id_start = id;
	spin_unlock(&mnt_id_lock);
}

/*
 * Allocate a new peer group ID
 *
 * mnt_group_ida is protected by namespace_sem
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
	int res;

	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
		return -ENOMEM;

	res = ida_get_new_above(&mnt_group_ida,
				mnt_group_start,
				&mnt->mnt_group_id);
	if (!res)
		mnt_group_start = mnt->mnt_group_id + 1;

	return res;
}

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt)
{
	int id = mnt->mnt_group_id;
	ida_remove(&mnt_group_ida, id);
	if (mnt_group_start > id)
		mnt_group_start = id;
	mnt->mnt_group_id = 0;
}

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
	preempt_disable();
	mnt->mnt_count += n;
	preempt_enable();
#endif
}

/*
 * vfsmount lock must be held for write
 */
unsigned int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
	}

	return count;
#else
	return mnt->mnt_count;
#endif
}

static struct mount *alloc_vfsmnt(const char *name)
{
	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		int err;

		err = mnt_alloc_id(mnt);
		if (err)
			goto out_free_cache;

		if (name) {
			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
			if (!mnt->mnt_devname)
				goto out_free_id;
		}

#ifdef CONFIG_SMP
		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
		if (!mnt->mnt_pcp)
			goto out_free_devname;

		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
		mnt->mnt_count = 1;
		mnt->mnt_writers = 0;
#endif

		INIT_LIST_HEAD(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
#ifdef CONFIG_FSNOTIFY
		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
	}
	return mnt;

#ifdef CONFIG_SMP
out_free_devname:
	kfree(mnt->mnt_devname);
#endif
out_free_id:
	mnt_free_id(mnt);
out_free_cache:
	kmem_cache_free(mnt_cache, mnt);
	return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*. This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
int __mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_flags & MNT_READONLY)
		return 1;
	if (mnt->mnt_sb->s_flags & MS_RDONLY)
		return 1;
	return 0;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers++;
#endif
}

static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers--;
#endif
}

static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
	}

	return count;
#else
	return mnt->mnt_writers;
#endif
}

static int mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_sb->s_readonly_remount)
		return 1;
	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
	smp_rmb();
	return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink(). We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * __mnt_want_write - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (the mount is read-write) before
 * returning success. This operation does not protect against the filesystem
 * being frozen. When the write operation is finished, __mnt_drop_write() must
 * be called. This is effectively a refcount.
 */
int __mnt_want_write(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
	smp_mb();
	while (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
		cpu_relax();
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
	 * MNT_WRITE_HOLD is cleared.
	 */
	smp_rmb();
	if (mnt_is_readonly(m)) {
		mnt_dec_writers(mnt);
		ret = -EROFS;
	}
	preempt_enable();

	return ret;
}

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success.
 * When the write operation is finished, mnt_drop_write() must be called. This
 * is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
	int ret;

	sb_start_write(m->mnt_sb);
	ret = __mnt_want_write(m);
	if (ret)
		sb_end_write(m->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * After finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
	/* superblock may be r/o */
	if (__mnt_is_readonly(mnt))
		return -EROFS;
	preempt_disable();
	mnt_inc_writers(real_mount(mnt));
	preempt_enable();
	return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);

/**
 * __mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount on which to take a write
 *
 * This is like __mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int __mnt_want_write_file(struct file *file)
{
	struct inode *inode = file->f_dentry->d_inode;

	if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
		return __mnt_want_write(file->f_path.mnt);
	else
		return mnt_clone_write(file->f_path.mnt);
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount on which to take a write
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int mnt_want_write_file(struct file *file)
{
	int ret;

	sb_start_write(file->f_path.mnt->mnt_sb);
	ret = __mnt_want_write_file(file);
	if (ret)
		sb_end_write(file->f_path.mnt->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

/**
 * __mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it. Must be matched with
 * __mnt_want_write() call above.
 */
void __mnt_drop_write(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again. Must be matched with
 * mnt_want_write() call above.
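 *
 * Typical usage (illustrative sketch):
 *
 *	err = mnt_want_write(path->mnt);
 *	if (err)
 *		return err;
 *	...modify the filesystem...
 *	mnt_drop_write(path->mnt);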
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	__mnt_drop_write(mnt);
	sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

void __mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write(file->f_path.mnt);
}

void mnt_drop_write_file(struct file *file)
{
	mnt_drop_write(file->f_path.mnt);
}
EXPORT_SYMBOL(mnt_drop_write_file);

static int mnt_make_readonly(struct mount *mnt)
{
	int ret = 0;

	br_write_lock(&vfsmount_lock);
	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
	/*
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();

	/*
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
		ret = -EBUSY;
	else
		mnt->mnt.mnt_flags |= MNT_READONLY;
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	br_write_unlock(&vfsmount_lock);
	return ret;
}

static void __mnt_unmake_readonly(struct mount *mnt)
{
	br_write_lock(&vfsmount_lock);
	mnt->mnt.mnt_flags &= ~MNT_READONLY;
	br_write_unlock(&vfsmount_lock);
}

int sb_prepare_remount_readonly(struct super_block *sb)
{
	struct mount *mnt;
	int err = 0;

	/* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
	if (atomic_long_read(&sb->s_remove_count))
		return -EBUSY;

	br_write_lock(&vfsmount_lock);
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
			smp_mb();
			if (mnt_get_writers(mnt) > 0) {
				err = -EBUSY;
				break;
			}
		}
	}
	if (!err && atomic_long_read(&sb->s_remove_count))
		err = -EBUSY;

	if (!err) {
		sb->s_readonly_remount = 1;
		smp_wmb();
	}
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	}
	br_write_unlock(&vfsmount_lock);

	return err;
}

static void free_vfsmnt(struct mount *mnt)
{
	kfree(mnt->mnt_devname);
	mnt_free_id(mnt);
#ifdef CONFIG_SMP
	free_percpu(mnt->mnt_pcp);
#endif
	kmem_cache_free(mnt_cache, mnt);
}

/*
 * find the first or last mount at @dentry on vfsmount @mnt depending on
 * @dir. If @dir is set return the first mount else return the last mount.
 * vfsmount_lock must be held for read or write.
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
			   int dir)
{
	struct list_head *head = mount_hashtable + hash(mnt, dentry);
	struct list_head *tmp = head;
	struct mount *p, *found = NULL;

	for (;;) {
		tmp = dir ? tmp->next : tmp->prev;
		p = NULL;
		if (tmp == head)
			break;
		p = list_entry(tmp, struct mount, mnt_hash);
		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) {
			found = p;
			break;
		}
	}
	return found;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically. If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(struct path *path)
{
	struct mount *child_mnt;

	br_read_lock(&vfsmount_lock);
	child_mnt = __lookup_mnt(path->mnt, path->dentry, 1);
	if (child_mnt) {
		mnt_add_count(child_mnt, 1);
		br_read_unlock(&vfsmount_lock);
		return &child_mnt->mnt;
	} else {
		br_read_unlock(&vfsmount_lock);
		return NULL;
	}
}

static inline int check_mnt(struct mount *mnt)
{
	return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * Clear dentry's mounted state if it has no remaining mounts.
 * vfsmount_lock must be held for write.
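 *
 * Note: this scans every mount hash chain looking for another mount on
 * @dentry, so it is only used on the slow detach/umount paths.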
 */
static void dentry_reset_mounted(struct dentry *dentry)
{
	unsigned u;

	for (u = 0; u < HASH_SIZE; u++) {
		struct mount *p;

		list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
			if (p->mnt_mountpoint == dentry)
				return;
		}
	}
	spin_lock(&dentry->d_lock);
	dentry->d_flags &= ~DCACHE_MOUNTED;
	spin_unlock(&dentry->d_lock);
}

/*
 * vfsmount lock must be held for write
 */
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
	old_path->dentry = mnt->mnt_mountpoint;
	old_path->mnt = &mnt->mnt_parent->mnt;
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	list_del_init(&mnt->mnt_child);
	list_del_init(&mnt->mnt_hash);
	dentry_reset_mounted(old_path->dentry);
}

/*
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt, struct dentry *dentry,
			struct mount *child_mnt)
{
	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
	child_mnt->mnt_mountpoint = dget(dentry);
	child_mnt->mnt_parent = mnt;
	spin_lock(&dentry->d_lock);
	dentry->d_flags |= DCACHE_MOUNTED;
	spin_unlock(&dentry->d_lock);
}

/*
 * vfsmount lock must be held for write
 */
static void attach_mnt(struct mount *mnt, struct path *path)
{
	mnt_set_mountpoint(real_mount(path->mnt), path->dentry, mnt);
	list_add_tail(&mnt->mnt_hash, mount_hashtable +
			hash(path->mnt, path->dentry));
	list_add_tail(&mnt->mnt_child, &real_mount(path->mnt)->mnt_mounts);
}

/*
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;

	list_splice(&head, n->list.prev);

	list_add_tail(&mnt->mnt_hash, mount_hashtable +
			hash(&parent->mnt, mnt->mnt_mountpoint));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
	touch_mnt_namespace(n);
}

static struct mount *next_mnt(struct mount *p, struct mount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct mount, mnt_child);
}

static struct mount *skip_mnt_tree(struct mount *p)
{
	struct list_head *prev = p->mnt_mounts.prev;
	while (prev != &p->mnt_mounts) {
		p = list_entry(prev, struct mount, mnt_child);
		prev = p->mnt_mounts.prev;
	}
	return p;
}

struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct mount *mnt;
	struct dentry *root;

	if (!type)
		return ERR_PTR(-ENODEV);

	mnt = alloc_vfsmnt(name);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flags & MS_KERNMOUNT)
		mnt->mnt.mnt_flags = MNT_INTERNAL;

	root = mount_fs(type, flags, name, data);
	if (IS_ERR(root)) {
		free_vfsmnt(mnt);
		return ERR_CAST(root);
	}

	mnt->mnt.mnt_root = root;
	mnt->mnt.mnt_sb = root->d_sb;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	br_write_lock(&vfsmount_lock);
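	/* make the new mount visible on its superblock's ->s_mounts list */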
	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
	br_write_unlock(&vfsmount_lock);
	return &mnt->mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);

static struct mount *clone_mnt(struct mount *old, struct dentry *root,
					int flag)
{
	struct super_block *sb = old->mnt.mnt_sb;
	struct mount *mnt;
	int err;

	mnt = alloc_vfsmnt(old->mnt_devname);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flag & (CL_SLAVE | CL_PRIVATE))
		mnt->mnt_group_id = 0; /* not a peer of original */
	else
		mnt->mnt_group_id = old->mnt_group_id;

	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
		err = mnt_alloc_group_id(mnt);
		if (err)
			goto out_free;
	}

	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD;
	atomic_inc(&sb->s_active);
	mnt->mnt.mnt_sb = sb;
	mnt->mnt.mnt_root = dget(root);
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	br_write_lock(&vfsmount_lock);
	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
	br_write_unlock(&vfsmount_lock);

	if (flag & CL_SLAVE) {
		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
		mnt->mnt_master = old;
		CLEAR_MNT_SHARED(mnt);
	} else if (!(flag & CL_PRIVATE)) {
		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
			list_add(&mnt->mnt_share, &old->mnt_share);
		if (IS_MNT_SLAVE(old))
			list_add(&mnt->mnt_slave, &old->mnt_slave);
		mnt->mnt_master = old->mnt_master;
	}
	if (flag & CL_MAKE_SHARED)
		set_mnt_shared(mnt);

	/* stick the duplicate mount on the same expiry list
	 * as the original if that was on one */
	if (flag & CL_EXPIRE) {
		if (!list_empty(&old->mnt_expire))
			list_add(&mnt->mnt_expire, &old->mnt_expire);
	}

	return mnt;

 out_free:
	free_vfsmnt(mnt);
	return ERR_PTR(err);
}

static inline void mntfree(struct mount *mnt)
{
	struct vfsmount *m = &mnt->mnt;
	struct super_block *sb = m->mnt_sb;

	/*
	 * This probably indicates that somebody messed
	 * up a mnt_want/drop_write() pair. If this
	 * happens, the filesystem was probably unable
	 * to make r/w->r/o transitions.
	 */
	/*
	 * The locking used to deal with mnt_count decrement provides barriers,
	 * so mnt_get_writers() below is safe.
	 */
	WARN_ON(mnt_get_writers(mnt));
	fsnotify_vfsmount_delete(m);
	dput(m->mnt_root);
	free_vfsmnt(mnt);
	deactivate_super(sb);
}

static void mntput_no_expire(struct mount *mnt)
{
put_again:
#ifdef CONFIG_SMP
	br_read_lock(&vfsmount_lock);
	if (likely(mnt->mnt_ns)) {
		/* shouldn't be the last one */
		mnt_add_count(mnt, -1);
		br_read_unlock(&vfsmount_lock);
		return;
	}
	br_read_unlock(&vfsmount_lock);

	br_write_lock(&vfsmount_lock);
	mnt_add_count(mnt, -1);
	if (mnt_get_count(mnt)) {
		br_write_unlock(&vfsmount_lock);
		return;
	}
#else
	mnt_add_count(mnt, -1);
	if (likely(mnt_get_count(mnt)))
		return;
	br_write_lock(&vfsmount_lock);
#endif
	if (unlikely(mnt->mnt_pinned)) {
		mnt_add_count(mnt, mnt->mnt_pinned + 1);
		mnt->mnt_pinned = 0;
		br_write_unlock(&vfsmount_lock);
		acct_auto_close_mnt(&mnt->mnt);
		goto put_again;
	}

	list_del(&mnt->mnt_instance);
	br_write_unlock(&vfsmount_lock);
	mntfree(mnt);
}

void mntput(struct vfsmount *mnt)
{
	if (mnt) {
		struct mount *m = real_mount(mnt);
		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
		if (unlikely(m->mnt_expiry_mark))
			m->mnt_expiry_mark = 0;
		mntput_no_expire(m);
	}
}
EXPORT_SYMBOL(mntput);

struct vfsmount *mntget(struct vfsmount *mnt)
{
	if (mnt)
		mnt_add_count(real_mount(mnt), 1);
	return mnt;
}
EXPORT_SYMBOL(mntget);

void mnt_pin(struct vfsmount *mnt)
{
	br_write_lock(&vfsmount_lock);
	real_mount(mnt)->mnt_pinned++;
	br_write_unlock(&vfsmount_lock);
}
EXPORT_SYMBOL(mnt_pin);

void mnt_unpin(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	br_write_lock(&vfsmount_lock);
	if (mnt->mnt_pinned) {
		mnt_add_count(mnt, 1);
		mnt->mnt_pinned--;
	}
	br_write_unlock(&vfsmount_lock);
}
EXPORT_SYMBOL(mnt_unpin);

static inline void mangle(struct seq_file *m, const char *s)
{
	seq_escape(m, s, " \t\n\\");
}

/*
 * Simple .show_options callback for filesystems which don't want to
 * implement more complex mount option showing.
 *
 * See also save_mount_options().
 */
int generic_show_options(struct seq_file *m, struct dentry *root)
{
	const char *options;

	rcu_read_lock();
	options = rcu_dereference(root->d_sb->s_options);

	if (options != NULL && options[0]) {
		seq_putc(m, ',');
		mangle(m, options);
	}
	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL(generic_show_options);

/*
 * If filesystem uses generic_show_options(), this function should be
 * called from the fill_super() callback.
 *
 * The .remount_fs callback usually needs to be handled in a special
 * way, to make sure that previous options are not overwritten if the
 * remount fails.
 *
 * Also note that if the filesystem's .remount_fs function doesn't
 * reset all options to their default value, but changes only newly
 * given options, then the displayed options will not reflect reality
 * any more.
 */
void save_mount_options(struct super_block *sb, char *options)
{
	BUG_ON(sb->s_options);
	rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
}
EXPORT_SYMBOL(save_mount_options);

void replace_mount_options(struct super_block *sb, char *options)
{
	char *old = sb->s_options;
	rcu_assign_pointer(sb->s_options, options);
	if (old) {
		synchronize_rcu();
		kfree(old);
	}
}
EXPORT_SYMBOL(replace_mount_options);

#ifdef CONFIG_PROC_FS
/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_mounts *p = proc_mounts(m);

	down_read(&namespace_sem);
	return seq_list_start(&p->ns->list, *pos);
}

static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct proc_mounts *p = proc_mounts(m);

	return seq_list_next(v, &p->ns->list, pos);
}

static void m_stop(struct seq_file *m, void *v)
{
	up_read(&namespace_sem);
}

static int m_show(struct seq_file *m, void *v)
{
	struct proc_mounts *p = proc_mounts(m);
	struct mount *r = list_entry(v, struct mount, mnt_list);
	return p->show(m, &r->mnt);
}

const struct seq_operations mounts_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= m_show,
};
#endif  /* CONFIG_PROC_FS */

/**
 * may_umount_tree - check if a mount tree is busy
 * @mnt: root of mount tree
 *
 * This is called to check if a tree of mounts has any
 * open files, pwds, chroots or sub mounts that are
 * busy.
 */
int may_umount_tree(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int actual_refs = 0;
	int minimum_refs = 0;
	struct mount *p;
	BUG_ON(!m);

	/* write lock needed for mnt_get_count */
	br_write_lock(&vfsmount_lock);
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		actual_refs += mnt_get_count(p);
		minimum_refs += 2;
	}
	br_write_unlock(&vfsmount_lock);

	if (actual_refs > minimum_refs)
		return 0;

	return 1;
}

EXPORT_SYMBOL(may_umount_tree);

/**
 * may_umount - check if a mount point is busy
 * @mnt: root of mount
 *
 * This is called to check if a mount point has any
 * open files, pwds, chroots or sub mounts. If the
 * mount has sub mounts this will return busy
 * regardless of whether the sub mounts are busy.
 *
 * Doesn't take quota and stuff into account. IOW, in some cases it will
 * give false negatives. The main reason why it's here is that we need
 * a non-destructive way to look for easily umountable filesystems.
 */
int may_umount(struct vfsmount *mnt)
{
	int ret = 1;
	down_read(&namespace_sem);
	br_write_lock(&vfsmount_lock);
	if (propagate_mount_busy(real_mount(mnt), 2))
		ret = 0;
	br_write_unlock(&vfsmount_lock);
	up_read(&namespace_sem);
	return ret;
}

EXPORT_SYMBOL(may_umount);

void release_mounts(struct list_head *head)
{
	struct mount *mnt;
	while (!list_empty(head)) {
		mnt = list_first_entry(head, struct mount, mnt_hash);
		list_del_init(&mnt->mnt_hash);
		if (mnt_has_parent(mnt)) {
			struct dentry *dentry;
			struct mount *m;

			br_write_lock(&vfsmount_lock);
			dentry = mnt->mnt_mountpoint;
			m = mnt->mnt_parent;
			mnt->mnt_mountpoint = mnt->mnt.mnt_root;
			mnt->mnt_parent = mnt;
			m->mnt_ghosts--;
			br_write_unlock(&vfsmount_lock);
			dput(dentry);
			mntput(&m->mnt);
		}
		mntput(&mnt->mnt);
	}
}

/*
 * vfsmount lock must be held for write
 * namespace_sem must be held for write
 */
void umount_tree(struct mount *mnt, int propagate, struct list_head *kill)
{
	LIST_HEAD(tmp_list);
	struct mount *p;

	for (p = mnt; p; p = next_mnt(p, mnt))
		list_move(&p->mnt_hash, &tmp_list);

	if (propagate)
		propagate_umount(&tmp_list);

	list_for_each_entry(p, &tmp_list, mnt_hash) {
		list_del_init(&p->mnt_expire);
		list_del_init(&p->mnt_list);
		__touch_mnt_namespace(p->mnt_ns);
		p->mnt_ns = NULL;
		list_del_init(&p->mnt_child);
		if (mnt_has_parent(p)) {
			p->mnt_parent->mnt_ghosts++;
			dentry_reset_mounted(p->mnt_mountpoint);
		}
		change_mnt_propagation(p, MS_PRIVATE);
	}
	list_splice(&tmp_list, kill);
}

static void shrink_submounts(struct mount *mnt, struct list_head *umounts);

static int do_umount(struct mount *mnt, int flags)
{
	struct super_block *sb = mnt->mnt.mnt_sb;
	int retval;
	LIST_HEAD(umount_list);

	retval = security_sb_umount(&mnt->mnt, flags);
	if (retval)
		return retval;

	/*
	 * Allow userspace to request a mountpoint be expired rather than
	 * unmounting unconditionally. Unmount only happens if:
	 *  (1) the mark is already set (the mark is cleared by mntput())
	 *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
	 */
	if (flags & MNT_EXPIRE) {
		if (&mnt->mnt == current->fs->root.mnt ||
		    flags & (MNT_FORCE | MNT_DETACH))
			return -EINVAL;

		/*
		 * probably don't strictly need the lock here if we examined
		 * all race cases, but it's a slowpath.
		 */
		br_write_lock(&vfsmount_lock);
		if (mnt_get_count(mnt) != 2) {
			br_write_unlock(&vfsmount_lock);
			return -EBUSY;
		}
		br_write_unlock(&vfsmount_lock);

		if (!xchg(&mnt->mnt_expiry_mark, 1))
			return -EAGAIN;
	}

	/*
	 * If we may have to abort operations to get out of this
	 * mount, and they will themselves hold resources we must
	 * allow the fs to do things. In the Unix tradition of
	 * 'Gee, that's tricky, let's do it in userspace' the umount_begin
	 * might fail to complete on the first run through as other tasks
	 * must return, and the like. That's for the mount program to worry
	 * about for the moment.
	 */

	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
		sb->s_op->umount_begin(sb);
	}

	/*
	 * No sense to grab the lock for this test, but test itself looks
	 * somewhat bogus. Suggestions for better replacement?
	 * Ho-hum... In principle, we might treat that as umount + switch
	 * to rootfs. GC would eventually take care of the old vfsmount.
	 * Actually it makes sense, especially if rootfs would contain a
	 * /reboot - static binary that would close all descriptors and
	 * call reboot(2). Then init(8) could umount root and exec /reboot.
	 */
	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
		/*
		 * Special case for "unmounting" root ...
		 * we just try to remount it readonly.
		 */
		down_write(&sb->s_umount);
		if (!(sb->s_flags & MS_RDONLY))
			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
		up_write(&sb->s_umount);
		return retval;
	}

	down_write(&namespace_sem);
	br_write_lock(&vfsmount_lock);
	event++;

	if (!(flags & MNT_DETACH))
		shrink_submounts(mnt, &umount_list);

	retval = -EBUSY;
	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
		if (!list_empty(&mnt->mnt_list))
			umount_tree(mnt, 1, &umount_list);
		retval = 0;
	}
	br_write_unlock(&vfsmount_lock);
	up_write(&namespace_sem);
	release_mounts(&umount_list);
	return retval;
}

/*
 * Now umount can handle mount points as well as block devices.
 * This is important for filesystems which use unnamed block devices.
 *
 * We now support a flag for forced unmount like the other 'big iron'
 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
 */

SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
	struct path path;
	struct mount *mnt;
	int retval;
	int lookup_flags = 0;

	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
		return -EINVAL;

	if (!(flags & UMOUNT_NOFOLLOW))
		lookup_flags |= LOOKUP_FOLLOW;

	retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
	if (retval)
		goto out;
	mnt = real_mount(path.mnt);
	retval = -EINVAL;
	if (path.dentry != path.mnt->mnt_root)
		goto dput_and_out;
	if (!check_mnt(mnt))
		goto dput_and_out;

	retval = -EPERM;
	if (!capable(CAP_SYS_ADMIN))
		goto dput_and_out;

	retval = do_umount(mnt, flags);
dput_and_out:
	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
	dput(path.dentry);
	mntput_no_expire(mnt);
out:
	return retval;
}

#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 *	The 2.0 compatible umount. No flags.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
	return sys_umount(name, 0);
}

#endif

static int mount_is_safe(struct path *path)
{
	if (capable(CAP_SYS_ADMIN))
		return 0;
	return -EPERM;
#ifdef notyet
	if (S_ISLNK(path->dentry->d_inode->i_mode))
		return -EPERM;
	if (path->dentry->d_inode->i_mode & S_ISVTX) {
		if (current_uid() != path->dentry->d_inode->i_uid)
			return -EPERM;
	}
	if (inode_permission(path->dentry->d_inode, MAY_WRITE))
		return -EPERM;
	return 0;
#endif
}

struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
					int flag)
{
	struct mount *res, *p, *q, *r;
	struct path path;

	if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
		return ERR_PTR(-EINVAL);

	res = q = clone_mnt(mnt, dentry, flag);
	if (IS_ERR(q))
		return q;

	q->mnt_mountpoint = mnt->mnt_mountpoint;

	p = mnt;
	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
		struct mount *s;
		if (!is_subdir(r->mnt_mountpoint, dentry))
			continue;

		for (s = r; s; s = next_mnt(s, r)) {
			if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) {
				s = skip_mnt_tree(s);
				continue;
			}
			while (p != s->mnt_parent) {
				p = p->mnt_parent;
				q = q->mnt_parent;
			}
			p = s;
			path.mnt = &q->mnt;
			path.dentry = p->mnt_mountpoint;
			q = clone_mnt(p, p->mnt.mnt_root, flag);
			if (IS_ERR(q))
				goto out;
			br_write_lock(&vfsmount_lock);
			list_add_tail(&q->mnt_list, &res->mnt_list);
			attach_mnt(q, &path);
			br_write_unlock(&vfsmount_lock);
		}
	}
	return res;
out:
	if (res) {
		LIST_HEAD(umount_list);
		br_write_lock(&vfsmount_lock);
		umount_tree(res, 0, &umount_list);
		br_write_unlock(&vfsmount_lock);
		release_mounts(&umount_list);
	}
	return q;
}

/* Caller should check returned pointer for errors */

struct vfsmount *collect_mounts(struct path *path)
{
	struct mount *tree;
	down_write(&namespace_sem);
	tree = copy_tree(real_mount(path->mnt), path->dentry,
			 CL_COPY_ALL | CL_PRIVATE);
	up_write(&namespace_sem);
	if (IS_ERR(tree))
		return NULL;
	return &tree->mnt;
}

void drop_collected_mounts(struct vfsmount *mnt)
{
	LIST_HEAD(umount_list);
	down_write(&namespace_sem);
	br_write_lock(&vfsmount_lock);
	umount_tree(real_mount(mnt), 0, &umount_list);
	br_write_unlock(&vfsmount_lock);
	up_write(&namespace_sem);
	release_mounts(&umount_list);
}

int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
		   struct vfsmount *root)
{
	struct mount *mnt;
	int res = f(root, arg);
	if (res)
		return res;
	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
		res = f(&mnt->mnt, arg);
		if (res)
			return res;
	}
	return 0;
}

static void cleanup_group_ids(struct mount *mnt, struct mount *end)
{
	struct mount *p;

	for (p = mnt; p != end; p = next_mnt(p, mnt)) {
		if (p->mnt_group_id && !IS_MNT_SHARED(p))
			mnt_release_group_id(p);
	}
}

static int invent_group_ids(struct mount *mnt, bool recurse)
{
	struct mount *p;

	for (p = mnt; p; p = recurse ?
			next_mnt(p, mnt) : NULL) {
		if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
			int err = mnt_alloc_group_id(p);
			if (err) {
				cleanup_group_ids(mnt, p);
				return err;
			}
		}
	}

	return 0;
}

/*
 * @source_mnt : mount tree to be attached
 * @nd         : place the mount tree @source_mnt is attached
 * @parent_nd  : if non-null, detach the source_mnt from its parent and
 *               store the parent mount and mountpoint dentry.
 *               (done when source_mnt is moved)
 *
 * NOTE: the table below explains the semantics when a source mount
 * of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 *       tree of the destination mount and the cloned mount is added to
 *       the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 *       source mount.
 *
 * ---------------------------------------------------------------------------
 * |         MOVE MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
 * ***************************************************************************
 *
 * (+)   the mount is moved to the destination. And is then propagated to
 *       all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++) the mount is moved to the destination and is then propagated to
 *       all the mounts belonging to the destination mount's propagation tree.
 *       the mount is marked as 'shared and slave'.
 * (*)   the mount continues to be a slave at the new location.
 *
 * If the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
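 *
 * Worked example (illustrative): if both the source and the destination of a
 * bind are shared, the (++) case applies: the clone shows up at the
 * destination and at every mount the destination propagates to, and it joins
 * the source mount's peer group.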
 */
static int attach_recursive_mnt(struct mount *source_mnt,
			struct path *path, struct path *parent_path)
{
	LIST_HEAD(tree_list);
	struct mount *dest_mnt = real_mount(path->mnt);
	struct dentry *dest_dentry = path->dentry;
	struct mount *child, *p;
	int err;

	if (IS_MNT_SHARED(dest_mnt)) {
		err = invent_group_ids(source_mnt, true);
		if (err)
			goto out;
	}
	err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
	if (err)
		goto out_cleanup_ids;

	br_write_lock(&vfsmount_lock);

	if (IS_MNT_SHARED(dest_mnt)) {
		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
			set_mnt_shared(p);
	}
	if (parent_path) {
		detach_mnt(source_mnt, parent_path);
		attach_mnt(source_mnt, path);
		touch_mnt_namespace(source_mnt->mnt_ns);
	} else {
		mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
		commit_tree(source_mnt);
	}

	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
		list_del_init(&child->mnt_hash);
		commit_tree(child);
	}
	br_write_unlock(&vfsmount_lock);

	return 0;

 out_cleanup_ids:
	if (IS_MNT_SHARED(dest_mnt))
		cleanup_group_ids(source_mnt, NULL);
 out:
	return err;
}

static int lock_mount(struct path *path)
{
	struct vfsmount *mnt;
retry:
	mutex_lock(&path->dentry->d_inode->i_mutex);
	if (unlikely(cant_mount(path->dentry))) {
		mutex_unlock(&path->dentry->d_inode->i_mutex);
		return -ENOENT;
	}
	down_write(&namespace_sem);
	mnt = lookup_mnt(path);
	if (likely(!mnt))
		return 0;
	up_write(&namespace_sem);
	mutex_unlock(&path->dentry->d_inode->i_mutex);
	path_put(path);
	path->mnt = mnt;
	path->dentry = dget(mnt->mnt_root);
	goto retry;
}

static void unlock_mount(struct path *path)
{
	up_write(&namespace_sem);
	mutex_unlock(&path->dentry->d_inode->i_mutex);
}

static int graft_tree(struct mount *mnt, struct path *path)
{
	if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
		return -EINVAL;

	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
	      S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
		return -ENOTDIR;

	if (d_unlinked(path->dentry))
		return -ENOENT;

	return attach_recursive_mnt(mnt, path, NULL);
}

/*
 * Sanity check the flags to change_mnt_propagation.
 */

static int flags_to_propagation_type(int flags)
{
	int type = flags & ~(MS_REC | MS_SILENT);

	/* Fail if any non-propagation flags are set */
	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		return 0;
	/* Only one propagation flag should be set */
	if (!is_power_of_2(type))
		return 0;
	return type;
}

/*
 * recursively change the type of the mountpoint.
 */
static int do_change_type(struct path *path, int flag)
{
	struct mount *m;
	struct mount *mnt = real_mount(path->mnt);
	int recurse = flag & MS_REC;
	int type;
	int err = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	type = flags_to_propagation_type(flag);
	if (!type)
		return -EINVAL;

	down_write(&namespace_sem);
	if (type == MS_SHARED) {
		err = invent_group_ids(mnt, recurse);
		if (err)
			goto out_unlock;
	}

	br_write_lock(&vfsmount_lock);
	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
		change_mnt_propagation(m, type);
	br_write_unlock(&vfsmount_lock);

 out_unlock:
	up_write(&namespace_sem);
	return err;
}

/*
 * do loopback mount.
 */
static int do_loopback(struct path *path, const char *old_name,
				int recurse)
{
	LIST_HEAD(umount_list);
	struct path old_path;
	struct mount *mnt = NULL, *old;
	int err = mount_is_safe(path);
	if (err)
		return err;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
	if (err)
		return err;

	err = lock_mount(path);
	if (err)
		goto out;

	old = real_mount(old_path.mnt);

	err = -EINVAL;
	if (IS_MNT_UNBINDABLE(old))
		goto out2;

	if (!check_mnt(real_mount(path->mnt)) || !check_mnt(old))
		goto out2;

	if (recurse)
		mnt = copy_tree(old, old_path.dentry, 0);
	else
		mnt = clone_mnt(old, old_path.dentry, 0);

	if (IS_ERR(mnt)) {
		err = PTR_ERR(mnt);
		goto out;
	}

	err = graft_tree(mnt, path);
	if (err) {
		br_write_lock(&vfsmount_lock);
		umount_tree(mnt, 0, &umount_list);
		br_write_unlock(&vfsmount_lock);
	}
out2:
	unlock_mount(path);
	release_mounts(&umount_list);
out:
	path_put(&old_path);
	return err;
}

static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
{
	int error = 0;
	int readonly_request = 0;

	if (ms_flags & MS_RDONLY)
		readonly_request = 1;
	if (readonly_request == __mnt_is_readonly(mnt))
		return 0;

	if (readonly_request)
		error = mnt_make_readonly(real_mount(mnt));
	else
		__mnt_unmake_readonly(real_mount(mnt));
	return error;
}

/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 */
static int do_remount(struct path *path, int flags, int mnt_flags,
		      void *data)
{
	int err;
	struct super_block *sb = path->mnt->mnt_sb;
	struct mount *mnt = real_mount(path->mnt);

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!check_mnt(mnt))
		return -EINVAL;

	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	err = security_sb_remount(sb, data);
	if (err)
		return err;

	down_write(&sb->s_umount);
	if (flags & MS_BIND)
		err = change_mount_flags(path->mnt, flags);
	else
		err = do_remount_sb(sb, flags, data, 0);
	if (!err) {
		br_write_lock(&vfsmount_lock);
		mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
		mnt->mnt.mnt_flags = mnt_flags;
		br_write_unlock(&vfsmount_lock);
	}
	up_write(&sb->s_umount);
	if (!err) {
		br_write_lock(&vfsmount_lock);
		touch_mnt_namespace(mnt->mnt_ns);
		br_write_unlock(&vfsmount_lock);
	}
	return err;
}

static inline int tree_contains_unbindable(struct mount *mnt)
{
	struct mount *p;
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		if (IS_MNT_UNBINDABLE(p))
			return 1;
	}
	return 0;
}

static int do_move_mount(struct path *path, const char *old_name)
{
	struct path old_path, parent_path;
	struct mount *p;
	struct mount *old;
	int err = 0;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
	if (err)
		return err;

	err = lock_mount(path);
	if (err < 0)
		goto out;

	old = real_mount(old_path.mnt);
	p = real_mount(path->mnt);

	err = -EINVAL;
	if (!check_mnt(p) || !check_mnt(old))
		goto out1;

	if (d_unlinked(path->dentry))
		goto out1;

	err = -EINVAL;
	if (old_path.dentry != old_path.mnt->mnt_root)
		goto out1;

	if (!mnt_has_parent(old))
		goto out1;

	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
	      S_ISDIR(old_path.dentry->d_inode->i_mode))
		goto out1;
	/*
	 * Don't move a mount residing in a shared parent.
	 */
	if (IS_MNT_SHARED(old->mnt_parent))
		goto out1;
	/*
	 * Don't move a mount tree containing unbindable mounts to a destination
	 * mount which is shared.
	 */
	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
		goto out1;
	err = -ELOOP;
	for (; mnt_has_parent(p); p = p->mnt_parent)
		if (p == old)
			goto out1;

	err = attach_recursive_mnt(old, path, &parent_path);
	if (err)
		goto out1;

	/* if the mount is moved, it should no longer expire
	 * automatically */
	list_del_init(&old->mnt_expire);
out1:
	unlock_mount(path);
out:
	if (!err)
		path_put(&parent_path);
	path_put(&old_path);
	return err;
}

static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
{
	int err;
	const char *subtype = strchr(fstype, '.');
	if (subtype) {
		subtype++;
		err = -EINVAL;
		if (!subtype[0])
			goto err;
	} else
		subtype = "";

	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
	err = -ENOMEM;
	if (!mnt->mnt_sb->s_subtype)
		goto err;
	return mnt;

 err:
	mntput(mnt);
	return ERR_PTR(err);
}

static struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
	struct file_system_type *type = get_fs_type(fstype);
	struct vfsmount *mnt;
	if (!type)
		return ERR_PTR(-ENODEV);
	mnt = vfs_kern_mount(type, flags, name, data);
	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
	    !mnt->mnt_sb->s_subtype)
		mnt = fs_set_subtype(mnt, fstype);
	put_filesystem(type);
	return mnt;
}

/*
 * add a mount into a namespace's mount tree
 */
static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
{
	int err;

	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);

	err = lock_mount(path);
	if (err)
		return err;

	err = -EINVAL;
	if (unlikely(!check_mnt(real_mount(path->mnt)))) {
		/* that's acceptable only for automounts done in private ns */
		if (!(mnt_flags & MNT_SHRINKABLE))
			goto unlock;
		/* ... and for those we'd better have mountpoint still alive */
		if (!real_mount(path->mnt)->mnt_ns)
			goto unlock;
	}

	/* Refuse the same filesystem on the same mount point */
	err = -EBUSY;
	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
	    path->mnt->mnt_root == path->dentry)
		goto unlock;

	err = -EINVAL;
	if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
		goto unlock;

	newmnt->mnt.mnt_flags = mnt_flags;
	err = graft_tree(newmnt, path);

unlock:
	unlock_mount(path);
	return err;
}

/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */
static int do_new_mount(struct path *path, const char *type, int flags,
			int mnt_flags, const char *name, void *data)
{
	struct vfsmount *mnt;
	int err;

	if (!type)
		return -EINVAL;

	/* we need capabilities... */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	mnt = do_kern_mount(type, flags, name, data);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	err = do_add_mount(real_mount(mnt), path, mnt_flags);
	if (err)
		mntput(mnt);
	return err;
}

int finish_automount(struct vfsmount *m, struct path *path)
{
	struct mount *mnt = real_mount(m);
	int err;
	/* The new mount record should have at least 2 refs to prevent it being
	 * expired before we get a chance to add it
	 */
	BUG_ON(mnt_get_count(mnt) < 2);

	if (m->mnt_sb == path->mnt->mnt_sb &&
	    m->mnt_root == path->dentry) {
		err = -ELOOP;
		goto fail;
	}

	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
	if (!err)
		return 0;
fail:
	/* remove m from any expiration list it may be on */
	if (!list_empty(&mnt->mnt_expire)) {
		down_write(&namespace_sem);
		br_write_lock(&vfsmount_lock);
		list_del_init(&mnt->mnt_expire);
		br_write_unlock(&vfsmount_lock);
		up_write(&namespace_sem);
	}
	mntput(m);
	mntput(m);
	return err;
}

/**
 * mnt_set_expiry - Put a mount on an expiration list
 * @mnt: The mount to list.
 * @expiry_list: The list to add the mount to.
 */
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
	down_write(&namespace_sem);
	br_write_lock(&vfsmount_lock);

	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);

	br_write_unlock(&vfsmount_lock);
	up_write(&namespace_sem);
}
EXPORT_SYMBOL(mnt_set_expiry);

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
	struct mount *mnt, *next;
	LIST_HEAD(graveyard);
	LIST_HEAD(umounts);

	if (list_empty(mounts))
		return;

	down_write(&namespace_sem);
	br_write_lock(&vfsmount_lock);

	/* extract from the expiration list every vfsmount that matches the
	 * following criteria:
	 * - only referenced by its parent vfsmount
	 * - still marked for expiry (marked on the last call here; marks are
	 *   cleared by mntput())
	 */
	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
			propagate_mount_busy(mnt, 1))
			continue;
		list_move(&mnt->mnt_expire, &graveyard);
	}
	while (!list_empty(&graveyard)) {
		mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
		touch_mnt_namespace(mnt->mnt_ns);
		umount_tree(mnt, 1, &umounts);
	}
	br_write_unlock(&vfsmount_lock);
	up_write(&namespace_sem);

	release_mounts(&umounts);
}

EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);

/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 */
static int select_submounts(struct mount *parent, struct list_head *graveyard)
{
	struct mount *this_parent = parent;
	struct list_head *next;
	int found = 0;

repeat:
	next = this_parent->mnt_mounts.next;
resume:
	while (next != &this_parent->mnt_mounts) {
		struct list_head *tmp = next;
		struct mount *mnt = list_entry(tmp, struct mount, mnt_child);

		next = tmp->next;
		if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
			continue;
		/*
		 * Descend a level if the mnt_mounts list is non-empty.
		 */
		if (!list_empty(&mnt->mnt_mounts)) {
			this_parent = mnt;
			goto repeat;
		}

		if (!propagate_mount_busy(mnt, 1)) {
			list_move_tail(&mnt->mnt_expire, graveyard);
			found++;
		}
	}
	/*
	 * All done at this level ... ascend and resume the search
	 */
	if (this_parent != parent) {
		next = this_parent->mnt_child.next;
		this_parent = this_parent->mnt_parent;
		goto resume;
	}
	return found;
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 *
 * vfsmount_lock must be held for write
 */
static void shrink_submounts(struct mount *mnt, struct list_head *umounts)
{
	LIST_HEAD(graveyard);
	struct mount *m;

	/* extract submounts of 'mountpoint' from the expiration list */
	while (select_submounts(mnt, &graveyard)) {
		while (!list_empty(&graveyard)) {
			m = list_first_entry(&graveyard, struct mount,
						mnt_expire);
			touch_mnt_namespace(m->mnt_ns);
			umount_tree(m, 1, umounts);
		}
	}
}

/*
 * Some copy_from_user() implementations do not return the exact number of
 * bytes remaining to copy on a fault.  But copy_mount_options() requires that.
 * Note that this function differs from copy_from_user() in that it will oops
 * on bad values of `to', rather than returning a short copy.
 */
static long exact_copy_from_user(void *to, const void __user * from,
				 unsigned long n)
{
	char *t = to;
	const char __user *f = from;
	char c;

	if (!access_ok(VERIFY_READ, from, n))
		return n;

	while (n) {
		if (__get_user(c, f)) {
			memset(t, 0, n);
			break;
		}
		*t++ = c;
		f++;
		n--;
	}
	return n;
}

int copy_mount_options(const void __user * data, unsigned long *where)
{
	int i;
	unsigned long page;
	unsigned long size;

	*where = 0;
	if (!data)
		return 0;

	if (!(page = __get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	/* We only care that *some* data at the address the user
	 * gave us is valid.  Just in case, we'll zero
	 * the remainder of the page.
	 */
	/* copy_from_user cannot cross TASK_SIZE ! */
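	/* clamp the copy to the space left below TASK_SIZE, at most one page */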
	 */
	size = TASK_SIZE - (unsigned long)data;
	if (size > PAGE_SIZE)
		size = PAGE_SIZE;

	i = size - exact_copy_from_user((void *)page, data, size);
	if (!i) {
		free_page(page);
		return -EFAULT;
	}
	if (i != PAGE_SIZE)
		memset((char *)page + i, 0, PAGE_SIZE - i);
	*where = page;
	return 0;
}

int copy_mount_string(const void __user *data, char **where)
{
	char *tmp;

	if (!data) {
		*where = NULL;
		return 0;
	}

	tmp = strndup_user(data, PAGE_SIZE);
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);

	*where = tmp;
	return 0;
}

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
long do_mount(const char *dev_name, const char *dir_name,
		const char *type_page, unsigned long flags, void *data_page)
{
	struct path path;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */

	if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
		return -EINVAL;

	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* ...
	 * and get the mountpoint */
	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &path,
				   type_page, flags, data_page);
	if (retval)
		goto dput_out;

	/* Default to relatime unless overridden */
	if (!(flags & MS_NOATIME))
		mnt_flags |= MNT_RELATIME;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (flags & MS_STRICTATIME)
		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
	if (flags & MS_RDONLY)
		mnt_flags |= MNT_READONLY;

	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_KERNMOUNT |
		   MS_STRICTATIME);

	if (flags & MS_REMOUNT)
		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	else if (flags & MS_BIND)
		retval = do_loopback(&path, dev_name, flags & MS_REC);
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&path, flags);
	else if (flags & MS_MOVE)
		retval = do_move_mount(&path, dev_name);
	else
		retval = do_new_mount(&path, type_page, flags, mnt_flags,
				      dev_name, data_page);
dput_out:
	path_put(&path);
	return retval;
}

static struct mnt_namespace *alloc_mnt_ns(void)
{
	struct mnt_namespace *new_ns;

	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
	if (!new_ns)
		return ERR_PTR(-ENOMEM);
	atomic_set(&new_ns->count, 1);
	new_ns->root = NULL;
	INIT_LIST_HEAD(&new_ns->list);
	init_waitqueue_head(&new_ns->poll);
	new_ns->event = 0;
	return new_ns;
}

/*
 * Allocate a new namespace structure and populate it with contents
 * copied from the namespace of the passed in task structure.
 */
static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
		struct fs_struct *fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
	struct mount *p, *q;
	struct mount *old = mnt_ns->root;
	struct mount *new;

	new_ns = alloc_mnt_ns();
	if (IS_ERR(new_ns))
		return new_ns;

	down_write(&namespace_sem);
	/* First pass: copy the tree topology */
	new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE);
	if (IS_ERR(new)) {
		up_write(&namespace_sem);
		kfree(new_ns);
		return ERR_CAST(new);
	}
	new_ns->root = new;
	br_write_lock(&vfsmount_lock);
	list_add_tail(&new_ns->list, &new->mnt_list);
	br_write_unlock(&vfsmount_lock);

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace.  We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
2313 */ 2314 p = old; 2315 q = new; 2316 while (p) { 2317 q->mnt_ns = new_ns; 2318 if (fs) { 2319 if (&p->mnt == fs->root.mnt) { 2320 fs->root.mnt = mntget(&q->mnt); 2321 rootmnt = &p->mnt; 2322 } 2323 if (&p->mnt == fs->pwd.mnt) { 2324 fs->pwd.mnt = mntget(&q->mnt); 2325 pwdmnt = &p->mnt; 2326 } 2327 } 2328 p = next_mnt(p, old); 2329 q = next_mnt(q, new); 2330 } 2331 up_write(&namespace_sem); 2332 2333 if (rootmnt) 2334 mntput(rootmnt); 2335 if (pwdmnt) 2336 mntput(pwdmnt); 2337 2338 return new_ns; 2339 } 2340 2341 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, 2342 struct fs_struct *new_fs) 2343 { 2344 struct mnt_namespace *new_ns; 2345 2346 BUG_ON(!ns); 2347 get_mnt_ns(ns); 2348 2349 if (!(flags & CLONE_NEWNS)) 2350 return ns; 2351 2352 new_ns = dup_mnt_ns(ns, new_fs); 2353 2354 put_mnt_ns(ns); 2355 return new_ns; 2356 } 2357 2358 /** 2359 * create_mnt_ns - creates a private namespace and adds a root filesystem 2360 * @mnt: pointer to the new root filesystem mountpoint 2361 */ 2362 static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) 2363 { 2364 struct mnt_namespace *new_ns = alloc_mnt_ns(); 2365 if (!IS_ERR(new_ns)) { 2366 struct mount *mnt = real_mount(m); 2367 mnt->mnt_ns = new_ns; 2368 new_ns->root = mnt; 2369 list_add(&new_ns->list, &mnt->mnt_list); 2370 } else { 2371 mntput(m); 2372 } 2373 return new_ns; 2374 } 2375 2376 struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) 2377 { 2378 struct mnt_namespace *ns; 2379 struct super_block *s; 2380 struct path path; 2381 int err; 2382 2383 ns = create_mnt_ns(mnt); 2384 if (IS_ERR(ns)) 2385 return ERR_CAST(ns); 2386 2387 err = vfs_path_lookup(mnt->mnt_root, mnt, 2388 name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); 2389 2390 put_mnt_ns(ns); 2391 2392 if (err) 2393 return ERR_PTR(err); 2394 2395 /* trade a vfsmount reference for active sb one */ 2396 s = path.mnt->mnt_sb; 2397 atomic_inc(&s->s_active); 2398 mntput(path.mnt); 2399 /* lock the sucker */ 2400 down_write(&s->s_umount); 2401 /* ... 
	 * and return the root of (sub)tree on it */
	return path.dentry;
}
EXPORT_SYMBOL(mount_subtree);

SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	int ret;
	char *kernel_type;
	struct filename *kernel_dir;
	char *kernel_dev;
	unsigned long data_page;

	ret = copy_mount_string(type, &kernel_type);
	if (ret < 0)
		goto out_type;

	kernel_dir = getname(dir_name);
	if (IS_ERR(kernel_dir)) {
		ret = PTR_ERR(kernel_dir);
		goto out_dir;
	}

	ret = copy_mount_string(dev_name, &kernel_dev);
	if (ret < 0)
		goto out_dev;

	ret = copy_mount_options(data, &data_page);
	if (ret < 0)
		goto out_data;

	ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags,
		       (void *) data_page);

	free_page(data_page);
out_data:
	kfree(kernel_dev);
out_dev:
	putname(kernel_dir);
out_dir:
	kfree(kernel_type);
out_type:
	return ret;
}

/*
 * Return true if path is reachable from root
 *
 * namespace_sem or vfsmount_lock is held
 */
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
			 const struct path *root)
{
	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
		dentry = mnt->mnt_mountpoint;
		mnt = mnt->mnt_parent;
	}
	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}

int path_is_under(struct path *path1, struct path *path2)
{
	int res;
	br_read_lock(&vfsmount_lock);
	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
	br_read_unlock(&vfsmount_lock);
	return res;
}
EXPORT_SYMBOL(path_is_under);

/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root.  The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root.  No other
 * file system may be mounted on put_old.  After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point.  It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.  An example hand-over sequence is sketched below.
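 *
 * For illustration only (an editorial addition, not part of the original
 * comment): a typical user-space hand-over sequence, assuming /new_root is
 * already populated and /new_root/old exists:
 *
 *	mount --bind /new_root /new_root   (only if it is not yet a mountpoint)
 *	cd /new_root
 *	pivot_root . old
 *	exec chroot . /sbin/init
 *	umount /old        (may need umount -l while the old root is still busy)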
2496 */ 2497 SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, 2498 const char __user *, put_old) 2499 { 2500 struct path new, old, parent_path, root_parent, root; 2501 struct mount *new_mnt, *root_mnt; 2502 int error; 2503 2504 if (!capable(CAP_SYS_ADMIN)) 2505 return -EPERM; 2506 2507 error = user_path_dir(new_root, &new); 2508 if (error) 2509 goto out0; 2510 2511 error = user_path_dir(put_old, &old); 2512 if (error) 2513 goto out1; 2514 2515 error = security_sb_pivotroot(&old, &new); 2516 if (error) 2517 goto out2; 2518 2519 get_fs_root(current->fs, &root); 2520 error = lock_mount(&old); 2521 if (error) 2522 goto out3; 2523 2524 error = -EINVAL; 2525 new_mnt = real_mount(new.mnt); 2526 root_mnt = real_mount(root.mnt); 2527 if (IS_MNT_SHARED(real_mount(old.mnt)) || 2528 IS_MNT_SHARED(new_mnt->mnt_parent) || 2529 IS_MNT_SHARED(root_mnt->mnt_parent)) 2530 goto out4; 2531 if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) 2532 goto out4; 2533 error = -ENOENT; 2534 if (d_unlinked(new.dentry)) 2535 goto out4; 2536 if (d_unlinked(old.dentry)) 2537 goto out4; 2538 error = -EBUSY; 2539 if (new.mnt == root.mnt || 2540 old.mnt == root.mnt) 2541 goto out4; /* loop, on the same file system */ 2542 error = -EINVAL; 2543 if (root.mnt->mnt_root != root.dentry) 2544 goto out4; /* not a mountpoint */ 2545 if (!mnt_has_parent(root_mnt)) 2546 goto out4; /* not attached */ 2547 if (new.mnt->mnt_root != new.dentry) 2548 goto out4; /* not a mountpoint */ 2549 if (!mnt_has_parent(new_mnt)) 2550 goto out4; /* not attached */ 2551 /* make sure we can reach put_old from new_root */ 2552 if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new)) 2553 goto out4; 2554 br_write_lock(&vfsmount_lock); 2555 detach_mnt(new_mnt, &parent_path); 2556 detach_mnt(root_mnt, &root_parent); 2557 /* mount old root on put_old */ 2558 attach_mnt(root_mnt, &old); 2559 /* mount new_root on / */ 2560 attach_mnt(new_mnt, &root_parent); 2561 touch_mnt_namespace(current->nsproxy->mnt_ns); 2562 br_write_unlock(&vfsmount_lock); 2563 chroot_fs_refs(&root, &new); 2564 error = 0; 2565 out4: 2566 unlock_mount(&old); 2567 if (!error) { 2568 path_put(&root_parent); 2569 path_put(&parent_path); 2570 } 2571 out3: 2572 path_put(&root); 2573 out2: 2574 path_put(&old); 2575 out1: 2576 path_put(&new); 2577 out0: 2578 return error; 2579 } 2580 2581 static void __init init_mount_tree(void) 2582 { 2583 struct vfsmount *mnt; 2584 struct mnt_namespace *ns; 2585 struct path root; 2586 2587 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 2588 if (IS_ERR(mnt)) 2589 panic("Can't create rootfs"); 2590 2591 ns = create_mnt_ns(mnt); 2592 if (IS_ERR(ns)) 2593 panic("Can't allocate initial namespace"); 2594 2595 init_task.nsproxy->mnt_ns = ns; 2596 get_mnt_ns(ns); 2597 2598 root.mnt = mnt; 2599 root.dentry = mnt->mnt_root; 2600 2601 set_fs_pwd(current->fs, &root); 2602 set_fs_root(current->fs, &root); 2603 } 2604 2605 void __init mnt_init(void) 2606 { 2607 unsigned u; 2608 int err; 2609 2610 init_rwsem(&namespace_sem); 2611 2612 mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 2613 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 2614 2615 mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); 2616 2617 if (!mount_hashtable) 2618 panic("Failed to allocate mount hash table\n"); 2619 2620 printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE); 2621 2622 for (u = 0; u < HASH_SIZE; u++) 2623 INIT_LIST_HEAD(&mount_hashtable[u]); 2624 2625 br_lock_init(&vfsmount_lock); 2626 2627 err = sysfs_init(); 2628 if (err) 2629 
		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
			__func__, err);
	fs_kobj = kobject_create_and_add("fs", NULL);
	if (!fs_kobj)
		printk(KERN_WARNING "%s: kobj create error\n", __func__);
	init_rootfs();
	init_mount_tree();
}

void put_mnt_ns(struct mnt_namespace *ns)
{
	LIST_HEAD(umount_list);

	if (!atomic_dec_and_test(&ns->count))
		return;
	down_write(&namespace_sem);
	br_write_lock(&vfsmount_lock);
	umount_tree(ns->root, 0, &umount_list);
	br_write_unlock(&vfsmount_lock);
	up_write(&namespace_sem);
	release_mounts(&umount_list);
	kfree(ns);
}

struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
{
	struct vfsmount *mnt;
	mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
	if (!IS_ERR(mnt)) {
		/*
		 * it is a longterm mount, don't release mnt until
		 * we unmount before file sys is unregistered
		 */
		real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
	}
	return mnt;
}
EXPORT_SYMBOL_GPL(kern_mount_data);

void kern_unmount(struct vfsmount *mnt)
{
	/* release long term mount so mount point can be released */
	if (!IS_ERR_OR_NULL(mnt)) {
		br_write_lock(&vfsmount_lock);
		real_mount(mnt)->mnt_ns = NULL;
		br_write_unlock(&vfsmount_lock);
		mntput(mnt);
	}
}
EXPORT_SYMBOL(kern_unmount);

bool our_mnt(struct vfsmount *mnt)
{
	return check_mnt(real_mount(mnt));
}
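
/*
 * Editorial addition, not part of namespace.c: a minimal user-space sketch of
 * how the sys_mount() and sys_pivot_root() entry points defined above are
 * typically exercised.  The paths and the MS_BIND/MS_REMOUNT combination are
 * illustrative assumptions only; the snippet is kept inside "#if 0" so it is
 * never compiled into the kernel.
 */
#if 0	/* user-space example */
#include <stdio.h>
#include <unistd.h>
#include <sys/mount.h>
#include <sys/syscall.h>

int main(void)
{
	/* Bind mount: mount(2) enters do_mount(), which dispatches MS_BIND
	 * to do_loopback(). */
	if (mount("/mnt/src", "/mnt/dst", NULL, MS_BIND, NULL) < 0)
		perror("mount MS_BIND");

	/* Make the bind mount read-only: the MS_REMOUNT branch of do_mount()
	 * hands this to do_remount(). */
	if (mount(NULL, "/mnt/dst", NULL,
		  MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) < 0)
		perror("mount MS_REMOUNT");

	/* pivot_root(2) has no glibc wrapper; invoke it via syscall(2).
	 * /new_root must already be a mount point and /new_root/old must
	 * exist, per the restrictions documented above. */
	if (syscall(SYS_pivot_root, "/new_root", "/new_root/old") < 0)
		perror("pivot_root");

	return 0;
}
#endif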