1 /* 2 * linux/fs/namespace.c 3 * 4 * (C) Copyright Al Viro 2000, 2001 5 * Released under GPL v2. 6 * 7 * Based on code from fs/super.c, copyright Linus Torvalds and others. 8 * Heavily rewritten. 9 */ 10 11 #include <linux/syscalls.h> 12 #include <linux/slab.h> 13 #include <linux/sched.h> 14 #include <linux/spinlock.h> 15 #include <linux/percpu.h> 16 #include <linux/init.h> 17 #include <linux/kernel.h> 18 #include <linux/acct.h> 19 #include <linux/capability.h> 20 #include <linux/cpumask.h> 21 #include <linux/module.h> 22 #include <linux/sysfs.h> 23 #include <linux/seq_file.h> 24 #include <linux/mnt_namespace.h> 25 #include <linux/namei.h> 26 #include <linux/nsproxy.h> 27 #include <linux/security.h> 28 #include <linux/mount.h> 29 #include <linux/ramfs.h> 30 #include <linux/log2.h> 31 #include <linux/idr.h> 32 #include <linux/fs_struct.h> 33 #include <linux/fsnotify.h> 34 #include <asm/uaccess.h> 35 #include <asm/unistd.h> 36 #include "pnode.h" 37 #include "internal.h" 38 39 #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) 40 #define HASH_SIZE (1UL << HASH_SHIFT) 41 42 static int event; 43 static DEFINE_IDA(mnt_id_ida); 44 static DEFINE_IDA(mnt_group_ida); 45 static DEFINE_SPINLOCK(mnt_id_lock); 46 static int mnt_id_start = 0; 47 static int mnt_group_start = 1; 48 49 static struct list_head *mount_hashtable __read_mostly; 50 static struct kmem_cache *mnt_cache __read_mostly; 51 static struct rw_semaphore namespace_sem; 52 53 /* /sys/fs */ 54 struct kobject *fs_kobj; 55 EXPORT_SYMBOL_GPL(fs_kobj); 56 57 /* 58 * vfsmount lock may be taken for read to prevent changes to the 59 * vfsmount hash, ie. during mountpoint lookups or walking back 60 * up the tree. 61 * 62 * It should be taken for write in all cases where the vfsmount 63 * tree or hash is modified or when a vfsmount structure is modified. 64 */ 65 DEFINE_BRLOCK(vfsmount_lock); 66 67 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) 68 { 69 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); 70 tmp += ((unsigned long)dentry / L1_CACHE_BYTES); 71 tmp = tmp + (tmp >> HASH_SHIFT); 72 return tmp & (HASH_SIZE - 1); 73 } 74 75 #define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) 76 77 /* 78 * allocation is serialized by namespace_sem, but we need the spinlock to 79 * serialize with freeing. 
80 */ 81 static int mnt_alloc_id(struct vfsmount *mnt) 82 { 83 int res; 84 85 retry: 86 ida_pre_get(&mnt_id_ida, GFP_KERNEL); 87 spin_lock(&mnt_id_lock); 88 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); 89 if (!res) 90 mnt_id_start = mnt->mnt_id + 1; 91 spin_unlock(&mnt_id_lock); 92 if (res == -EAGAIN) 93 goto retry; 94 95 return res; 96 } 97 98 static void mnt_free_id(struct vfsmount *mnt) 99 { 100 int id = mnt->mnt_id; 101 spin_lock(&mnt_id_lock); 102 ida_remove(&mnt_id_ida, id); 103 if (mnt_id_start > id) 104 mnt_id_start = id; 105 spin_unlock(&mnt_id_lock); 106 } 107 108 /* 109 * Allocate a new peer group ID 110 * 111 * mnt_group_ida is protected by namespace_sem 112 */ 113 static int mnt_alloc_group_id(struct vfsmount *mnt) 114 { 115 int res; 116 117 if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL)) 118 return -ENOMEM; 119 120 res = ida_get_new_above(&mnt_group_ida, 121 mnt_group_start, 122 &mnt->mnt_group_id); 123 if (!res) 124 mnt_group_start = mnt->mnt_group_id + 1; 125 126 return res; 127 } 128 129 /* 130 * Release a peer group ID 131 */ 132 void mnt_release_group_id(struct vfsmount *mnt) 133 { 134 int id = mnt->mnt_group_id; 135 ida_remove(&mnt_group_ida, id); 136 if (mnt_group_start > id) 137 mnt_group_start = id; 138 mnt->mnt_group_id = 0; 139 } 140 141 /* 142 * vfsmount lock must be held for read 143 */ 144 static inline void mnt_add_count(struct vfsmount *mnt, int n) 145 { 146 #ifdef CONFIG_SMP 147 this_cpu_add(mnt->mnt_pcp->mnt_count, n); 148 #else 149 preempt_disable(); 150 mnt->mnt_count += n; 151 preempt_enable(); 152 #endif 153 } 154 155 static inline void mnt_set_count(struct vfsmount *mnt, int n) 156 { 157 #ifdef CONFIG_SMP 158 this_cpu_write(mnt->mnt_pcp->mnt_count, n); 159 #else 160 mnt->mnt_count = n; 161 #endif 162 } 163 164 /* 165 * vfsmount lock must be held for read 166 */ 167 static inline void mnt_inc_count(struct vfsmount *mnt) 168 { 169 mnt_add_count(mnt, 1); 170 } 171 172 /* 173 * vfsmount lock must be held for read 174 */ 175 static inline void mnt_dec_count(struct vfsmount *mnt) 176 { 177 mnt_add_count(mnt, -1); 178 } 179 180 /* 181 * vfsmount lock must be held for write 182 */ 183 unsigned int mnt_get_count(struct vfsmount *mnt) 184 { 185 #ifdef CONFIG_SMP 186 unsigned int count = 0; 187 int cpu; 188 189 for_each_possible_cpu(cpu) { 190 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; 191 } 192 193 return count; 194 #else 195 return mnt->mnt_count; 196 #endif 197 } 198 199 struct vfsmount *alloc_vfsmnt(const char *name) 200 { 201 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 202 if (mnt) { 203 int err; 204 205 err = mnt_alloc_id(mnt); 206 if (err) 207 goto out_free_cache; 208 209 if (name) { 210 mnt->mnt_devname = kstrdup(name, GFP_KERNEL); 211 if (!mnt->mnt_devname) 212 goto out_free_id; 213 } 214 215 #ifdef CONFIG_SMP 216 mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); 217 if (!mnt->mnt_pcp) 218 goto out_free_devname; 219 220 this_cpu_add(mnt->mnt_pcp->mnt_count, 1); 221 #else 222 mnt->mnt_count = 1; 223 mnt->mnt_writers = 0; 224 #endif 225 226 INIT_LIST_HEAD(&mnt->mnt_hash); 227 INIT_LIST_HEAD(&mnt->mnt_child); 228 INIT_LIST_HEAD(&mnt->mnt_mounts); 229 INIT_LIST_HEAD(&mnt->mnt_list); 230 INIT_LIST_HEAD(&mnt->mnt_expire); 231 INIT_LIST_HEAD(&mnt->mnt_share); 232 INIT_LIST_HEAD(&mnt->mnt_slave_list); 233 INIT_LIST_HEAD(&mnt->mnt_slave); 234 #ifdef CONFIG_FSNOTIFY 235 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); 236 #endif 237 } 238 return mnt; 239 240 #ifdef CONFIG_SMP 241 out_free_devname: 242 kfree(mnt->mnt_devname); 
#endif
out_free_id:
	mnt_free_id(mnt);
out_free_cache:
	kmem_cache_free(mnt_cache, mnt);
	return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is r/w right *now*. This cannot and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
int __mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_flags & MNT_READONLY)
		return 1;
	if (mnt->mnt_sb->s_flags & MS_RDONLY)
		return 1;
	return 0;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

static inline void mnt_inc_writers(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers++;
#endif
}

static inline void mnt_dec_writers(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers--;
#endif
}

static unsigned int mnt_get_writers(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
	}

	return count;
#else
	return mnt->mnt_writers;
#endif
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/**
 * mnt_want_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is
 * about to be performed to it, and makes sure that
 * writes are allowed before returning success. When
 * the write operation is finished, mnt_drop_write()
 * must be called. This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *mnt)
{
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
	smp_mb();
	while (mnt->mnt_flags & MNT_WRITE_HOLD)
		cpu_relax();
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
	 * MNT_WRITE_HOLD is cleared.
	 */
	smp_rmb();
	if (__mnt_is_readonly(mnt)) {
		mnt_dec_writers(mnt);
		ret = -EROFS;
		goto out;
	}
out:
	preempt_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * After finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
	/* superblock may be r/o */
	if (__mnt_is_readonly(mnt))
		return -EROFS;
	preempt_disable();
	mnt_inc_writers(mnt);
	preempt_enable();
	return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount we should take a write on
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int mnt_want_write_file(struct file *file)
{
	struct inode *inode = file->f_dentry->d_inode;
	if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
		return mnt_want_write(file->f_path.mnt);
	else
		return mnt_clone_write(file->f_path.mnt);
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it. Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(mnt);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
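/*
 * Illustrative sketch, not part of the original file: a typical caller
 * brackets a filesystem modification with the pair above, e.g.
 * (do_the_write() is a hypothetical helper):
 *
 *	error = mnt_want_write(file->f_path.mnt);
 *	if (error)
 *		return error;
 *	error = do_the_write(file);
 *	mnt_drop_write(file->f_path.mnt);
 *	return error;
 */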
static int mnt_make_readonly(struct vfsmount *mnt)
{
	int ret = 0;

	br_write_lock(vfsmount_lock);
	mnt->mnt_flags |= MNT_WRITE_HOLD;
	/*
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();

	/*
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
		ret = -EBUSY;
	else
		mnt->mnt_flags |= MNT_READONLY;
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
455 */ 456 smp_wmb(); 457 mnt->mnt_flags &= ~MNT_WRITE_HOLD; 458 br_write_unlock(vfsmount_lock); 459 return ret; 460 } 461 462 static void __mnt_unmake_readonly(struct vfsmount *mnt) 463 { 464 br_write_lock(vfsmount_lock); 465 mnt->mnt_flags &= ~MNT_READONLY; 466 br_write_unlock(vfsmount_lock); 467 } 468 469 void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 470 { 471 mnt->mnt_sb = sb; 472 mnt->mnt_root = dget(sb->s_root); 473 } 474 475 EXPORT_SYMBOL(simple_set_mnt); 476 477 void free_vfsmnt(struct vfsmount *mnt) 478 { 479 kfree(mnt->mnt_devname); 480 mnt_free_id(mnt); 481 #ifdef CONFIG_SMP 482 free_percpu(mnt->mnt_pcp); 483 #endif 484 kmem_cache_free(mnt_cache, mnt); 485 } 486 487 /* 488 * find the first or last mount at @dentry on vfsmount @mnt depending on 489 * @dir. If @dir is set return the first mount else return the last mount. 490 * vfsmount_lock must be held for read or write. 491 */ 492 struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, 493 int dir) 494 { 495 struct list_head *head = mount_hashtable + hash(mnt, dentry); 496 struct list_head *tmp = head; 497 struct vfsmount *p, *found = NULL; 498 499 for (;;) { 500 tmp = dir ? tmp->next : tmp->prev; 501 p = NULL; 502 if (tmp == head) 503 break; 504 p = list_entry(tmp, struct vfsmount, mnt_hash); 505 if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) { 506 found = p; 507 break; 508 } 509 } 510 return found; 511 } 512 513 /* 514 * lookup_mnt increments the ref count before returning 515 * the vfsmount struct. 516 */ 517 struct vfsmount *lookup_mnt(struct path *path) 518 { 519 struct vfsmount *child_mnt; 520 521 br_read_lock(vfsmount_lock); 522 if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) 523 mntget(child_mnt); 524 br_read_unlock(vfsmount_lock); 525 return child_mnt; 526 } 527 528 static inline int check_mnt(struct vfsmount *mnt) 529 { 530 return mnt->mnt_ns == current->nsproxy->mnt_ns; 531 } 532 533 /* 534 * vfsmount lock must be held for write 535 */ 536 static void touch_mnt_namespace(struct mnt_namespace *ns) 537 { 538 if (ns) { 539 ns->event = ++event; 540 wake_up_interruptible(&ns->poll); 541 } 542 } 543 544 /* 545 * vfsmount lock must be held for write 546 */ 547 static void __touch_mnt_namespace(struct mnt_namespace *ns) 548 { 549 if (ns && ns->event != event) { 550 ns->event = event; 551 wake_up_interruptible(&ns->poll); 552 } 553 } 554 555 /* 556 * Clear dentry's mounted state if it has no remaining mounts. 557 * vfsmount_lock must be held for write. 
558 */ 559 static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry) 560 { 561 unsigned u; 562 563 for (u = 0; u < HASH_SIZE; u++) { 564 struct vfsmount *p; 565 566 list_for_each_entry(p, &mount_hashtable[u], mnt_hash) { 567 if (p->mnt_mountpoint == dentry) 568 return; 569 } 570 } 571 spin_lock(&dentry->d_lock); 572 dentry->d_flags &= ~DCACHE_MOUNTED; 573 spin_unlock(&dentry->d_lock); 574 } 575 576 /* 577 * vfsmount lock must be held for write 578 */ 579 static void detach_mnt(struct vfsmount *mnt, struct path *old_path) 580 { 581 old_path->dentry = mnt->mnt_mountpoint; 582 old_path->mnt = mnt->mnt_parent; 583 mnt->mnt_parent = mnt; 584 mnt->mnt_mountpoint = mnt->mnt_root; 585 list_del_init(&mnt->mnt_child); 586 list_del_init(&mnt->mnt_hash); 587 dentry_reset_mounted(old_path->mnt, old_path->dentry); 588 } 589 590 /* 591 * vfsmount lock must be held for write 592 */ 593 void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, 594 struct vfsmount *child_mnt) 595 { 596 child_mnt->mnt_parent = mntget(mnt); 597 child_mnt->mnt_mountpoint = dget(dentry); 598 spin_lock(&dentry->d_lock); 599 dentry->d_flags |= DCACHE_MOUNTED; 600 spin_unlock(&dentry->d_lock); 601 } 602 603 /* 604 * vfsmount lock must be held for write 605 */ 606 static void attach_mnt(struct vfsmount *mnt, struct path *path) 607 { 608 mnt_set_mountpoint(path->mnt, path->dentry, mnt); 609 list_add_tail(&mnt->mnt_hash, mount_hashtable + 610 hash(path->mnt, path->dentry)); 611 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); 612 } 613 614 static inline void __mnt_make_longterm(struct vfsmount *mnt) 615 { 616 #ifdef CONFIG_SMP 617 atomic_inc(&mnt->mnt_longterm); 618 #endif 619 } 620 621 /* needs vfsmount lock for write */ 622 static inline void __mnt_make_shortterm(struct vfsmount *mnt) 623 { 624 #ifdef CONFIG_SMP 625 atomic_dec(&mnt->mnt_longterm); 626 #endif 627 } 628 629 /* 630 * vfsmount lock must be held for write 631 */ 632 static void commit_tree(struct vfsmount *mnt) 633 { 634 struct vfsmount *parent = mnt->mnt_parent; 635 struct vfsmount *m; 636 LIST_HEAD(head); 637 struct mnt_namespace *n = parent->mnt_ns; 638 639 BUG_ON(parent == mnt); 640 641 list_add_tail(&head, &mnt->mnt_list); 642 list_for_each_entry(m, &head, mnt_list) { 643 m->mnt_ns = n; 644 __mnt_make_longterm(m); 645 } 646 647 list_splice(&head, n->list.prev); 648 649 list_add_tail(&mnt->mnt_hash, mount_hashtable + 650 hash(parent, mnt->mnt_mountpoint)); 651 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); 652 touch_mnt_namespace(n); 653 } 654 655 static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) 656 { 657 struct list_head *next = p->mnt_mounts.next; 658 if (next == &p->mnt_mounts) { 659 while (1) { 660 if (p == root) 661 return NULL; 662 next = p->mnt_child.next; 663 if (next != &p->mnt_parent->mnt_mounts) 664 break; 665 p = p->mnt_parent; 666 } 667 } 668 return list_entry(next, struct vfsmount, mnt_child); 669 } 670 671 static struct vfsmount *skip_mnt_tree(struct vfsmount *p) 672 { 673 struct list_head *prev = p->mnt_mounts.prev; 674 while (prev != &p->mnt_mounts) { 675 p = list_entry(prev, struct vfsmount, mnt_child); 676 prev = p->mnt_mounts.prev; 677 } 678 return p; 679 } 680 681 static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, 682 int flag) 683 { 684 struct super_block *sb = old->mnt_sb; 685 struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname); 686 687 if (mnt) { 688 if (flag & (CL_SLAVE | CL_PRIVATE)) 689 mnt->mnt_group_id = 0; /* not a peer of original */ 690 
else 691 mnt->mnt_group_id = old->mnt_group_id; 692 693 if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { 694 int err = mnt_alloc_group_id(mnt); 695 if (err) 696 goto out_free; 697 } 698 699 mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD; 700 atomic_inc(&sb->s_active); 701 mnt->mnt_sb = sb; 702 mnt->mnt_root = dget(root); 703 mnt->mnt_mountpoint = mnt->mnt_root; 704 mnt->mnt_parent = mnt; 705 706 if (flag & CL_SLAVE) { 707 list_add(&mnt->mnt_slave, &old->mnt_slave_list); 708 mnt->mnt_master = old; 709 CLEAR_MNT_SHARED(mnt); 710 } else if (!(flag & CL_PRIVATE)) { 711 if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) 712 list_add(&mnt->mnt_share, &old->mnt_share); 713 if (IS_MNT_SLAVE(old)) 714 list_add(&mnt->mnt_slave, &old->mnt_slave); 715 mnt->mnt_master = old->mnt_master; 716 } 717 if (flag & CL_MAKE_SHARED) 718 set_mnt_shared(mnt); 719 720 /* stick the duplicate mount on the same expiry list 721 * as the original if that was on one */ 722 if (flag & CL_EXPIRE) { 723 if (!list_empty(&old->mnt_expire)) 724 list_add(&mnt->mnt_expire, &old->mnt_expire); 725 } 726 } 727 return mnt; 728 729 out_free: 730 free_vfsmnt(mnt); 731 return NULL; 732 } 733 734 static inline void mntfree(struct vfsmount *mnt) 735 { 736 struct super_block *sb = mnt->mnt_sb; 737 738 /* 739 * This probably indicates that somebody messed 740 * up a mnt_want/drop_write() pair. If this 741 * happens, the filesystem was probably unable 742 * to make r/w->r/o transitions. 743 */ 744 /* 745 * The locking used to deal with mnt_count decrement provides barriers, 746 * so mnt_get_writers() below is safe. 747 */ 748 WARN_ON(mnt_get_writers(mnt)); 749 fsnotify_vfsmount_delete(mnt); 750 dput(mnt->mnt_root); 751 free_vfsmnt(mnt); 752 deactivate_super(sb); 753 } 754 755 static void mntput_no_expire(struct vfsmount *mnt) 756 { 757 put_again: 758 #ifdef CONFIG_SMP 759 br_read_lock(vfsmount_lock); 760 if (likely(atomic_read(&mnt->mnt_longterm))) { 761 mnt_dec_count(mnt); 762 br_read_unlock(vfsmount_lock); 763 return; 764 } 765 br_read_unlock(vfsmount_lock); 766 767 br_write_lock(vfsmount_lock); 768 mnt_dec_count(mnt); 769 if (mnt_get_count(mnt)) { 770 br_write_unlock(vfsmount_lock); 771 return; 772 } 773 #else 774 mnt_dec_count(mnt); 775 if (likely(mnt_get_count(mnt))) 776 return; 777 br_write_lock(vfsmount_lock); 778 #endif 779 if (unlikely(mnt->mnt_pinned)) { 780 mnt_add_count(mnt, mnt->mnt_pinned + 1); 781 mnt->mnt_pinned = 0; 782 br_write_unlock(vfsmount_lock); 783 acct_auto_close_mnt(mnt); 784 goto put_again; 785 } 786 br_write_unlock(vfsmount_lock); 787 mntfree(mnt); 788 } 789 790 void mntput(struct vfsmount *mnt) 791 { 792 if (mnt) { 793 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ 794 if (unlikely(mnt->mnt_expiry_mark)) 795 mnt->mnt_expiry_mark = 0; 796 mntput_no_expire(mnt); 797 } 798 } 799 EXPORT_SYMBOL(mntput); 800 801 struct vfsmount *mntget(struct vfsmount *mnt) 802 { 803 if (mnt) 804 mnt_inc_count(mnt); 805 return mnt; 806 } 807 EXPORT_SYMBOL(mntget); 808 809 void mnt_pin(struct vfsmount *mnt) 810 { 811 br_write_lock(vfsmount_lock); 812 mnt->mnt_pinned++; 813 br_write_unlock(vfsmount_lock); 814 } 815 EXPORT_SYMBOL(mnt_pin); 816 817 void mnt_unpin(struct vfsmount *mnt) 818 { 819 br_write_lock(vfsmount_lock); 820 if (mnt->mnt_pinned) { 821 mnt_inc_count(mnt); 822 mnt->mnt_pinned--; 823 } 824 br_write_unlock(vfsmount_lock); 825 } 826 EXPORT_SYMBOL(mnt_unpin); 827 828 static inline void mangle(struct seq_file *m, const char *s) 829 { 830 seq_escape(m, s, " \t\n\\"); 831 } 832 833 /* 834 * Simple 
.show_options callback for filesystems which don't want to 835 * implement more complex mount option showing. 836 * 837 * See also save_mount_options(). 838 */ 839 int generic_show_options(struct seq_file *m, struct vfsmount *mnt) 840 { 841 const char *options; 842 843 rcu_read_lock(); 844 options = rcu_dereference(mnt->mnt_sb->s_options); 845 846 if (options != NULL && options[0]) { 847 seq_putc(m, ','); 848 mangle(m, options); 849 } 850 rcu_read_unlock(); 851 852 return 0; 853 } 854 EXPORT_SYMBOL(generic_show_options); 855 856 /* 857 * If filesystem uses generic_show_options(), this function should be 858 * called from the fill_super() callback. 859 * 860 * The .remount_fs callback usually needs to be handled in a special 861 * way, to make sure, that previous options are not overwritten if the 862 * remount fails. 863 * 864 * Also note, that if the filesystem's .remount_fs function doesn't 865 * reset all options to their default value, but changes only newly 866 * given options, then the displayed options will not reflect reality 867 * any more. 868 */ 869 void save_mount_options(struct super_block *sb, char *options) 870 { 871 BUG_ON(sb->s_options); 872 rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL)); 873 } 874 EXPORT_SYMBOL(save_mount_options); 875 876 void replace_mount_options(struct super_block *sb, char *options) 877 { 878 char *old = sb->s_options; 879 rcu_assign_pointer(sb->s_options, options); 880 if (old) { 881 synchronize_rcu(); 882 kfree(old); 883 } 884 } 885 EXPORT_SYMBOL(replace_mount_options); 886 887 #ifdef CONFIG_PROC_FS 888 /* iterator */ 889 static void *m_start(struct seq_file *m, loff_t *pos) 890 { 891 struct proc_mounts *p = m->private; 892 893 down_read(&namespace_sem); 894 return seq_list_start(&p->ns->list, *pos); 895 } 896 897 static void *m_next(struct seq_file *m, void *v, loff_t *pos) 898 { 899 struct proc_mounts *p = m->private; 900 901 return seq_list_next(v, &p->ns->list, pos); 902 } 903 904 static void m_stop(struct seq_file *m, void *v) 905 { 906 up_read(&namespace_sem); 907 } 908 909 int mnt_had_events(struct proc_mounts *p) 910 { 911 struct mnt_namespace *ns = p->ns; 912 int res = 0; 913 914 br_read_lock(vfsmount_lock); 915 if (p->event != ns->event) { 916 p->event = ns->event; 917 res = 1; 918 } 919 br_read_unlock(vfsmount_lock); 920 921 return res; 922 } 923 924 struct proc_fs_info { 925 int flag; 926 const char *str; 927 }; 928 929 static int show_sb_opts(struct seq_file *m, struct super_block *sb) 930 { 931 static const struct proc_fs_info fs_info[] = { 932 { MS_SYNCHRONOUS, ",sync" }, 933 { MS_DIRSYNC, ",dirsync" }, 934 { MS_MANDLOCK, ",mand" }, 935 { 0, NULL } 936 }; 937 const struct proc_fs_info *fs_infop; 938 939 for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { 940 if (sb->s_flags & fs_infop->flag) 941 seq_puts(m, fs_infop->str); 942 } 943 944 return security_sb_show_options(m, sb); 945 } 946 947 static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) 948 { 949 static const struct proc_fs_info mnt_info[] = { 950 { MNT_NOSUID, ",nosuid" }, 951 { MNT_NODEV, ",nodev" }, 952 { MNT_NOEXEC, ",noexec" }, 953 { MNT_NOATIME, ",noatime" }, 954 { MNT_NODIRATIME, ",nodiratime" }, 955 { MNT_RELATIME, ",relatime" }, 956 { 0, NULL } 957 }; 958 const struct proc_fs_info *fs_infop; 959 960 for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { 961 if (mnt->mnt_flags & fs_infop->flag) 962 seq_puts(m, fs_infop->str); 963 } 964 } 965 966 static void show_type(struct seq_file *m, struct super_block *sb) 967 { 968 mangle(m, 
sb->s_type->name); 969 if (sb->s_subtype && sb->s_subtype[0]) { 970 seq_putc(m, '.'); 971 mangle(m, sb->s_subtype); 972 } 973 } 974 975 static int show_vfsmnt(struct seq_file *m, void *v) 976 { 977 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); 978 int err = 0; 979 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 980 981 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 982 seq_putc(m, ' '); 983 seq_path(m, &mnt_path, " \t\n\\"); 984 seq_putc(m, ' '); 985 show_type(m, mnt->mnt_sb); 986 seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); 987 err = show_sb_opts(m, mnt->mnt_sb); 988 if (err) 989 goto out; 990 show_mnt_opts(m, mnt); 991 if (mnt->mnt_sb->s_op->show_options) 992 err = mnt->mnt_sb->s_op->show_options(m, mnt); 993 seq_puts(m, " 0 0\n"); 994 out: 995 return err; 996 } 997 998 const struct seq_operations mounts_op = { 999 .start = m_start, 1000 .next = m_next, 1001 .stop = m_stop, 1002 .show = show_vfsmnt 1003 }; 1004 1005 static int show_mountinfo(struct seq_file *m, void *v) 1006 { 1007 struct proc_mounts *p = m->private; 1008 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); 1009 struct super_block *sb = mnt->mnt_sb; 1010 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 1011 struct path root = p->root; 1012 int err = 0; 1013 1014 seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, 1015 MAJOR(sb->s_dev), MINOR(sb->s_dev)); 1016 seq_dentry(m, mnt->mnt_root, " \t\n\\"); 1017 seq_putc(m, ' '); 1018 seq_path_root(m, &mnt_path, &root, " \t\n\\"); 1019 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { 1020 /* 1021 * Mountpoint is outside root, discard that one. Ugly, 1022 * but less so than trying to do that in iterator in a 1023 * race-free way (due to renames). 1024 */ 1025 return SEQ_SKIP; 1026 } 1027 seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); 1028 show_mnt_opts(m, mnt); 1029 1030 /* Tagged fields ("foo:X" or "bar") */ 1031 if (IS_MNT_SHARED(mnt)) 1032 seq_printf(m, " shared:%i", mnt->mnt_group_id); 1033 if (IS_MNT_SLAVE(mnt)) { 1034 int master = mnt->mnt_master->mnt_group_id; 1035 int dom = get_dominating_id(mnt, &p->root); 1036 seq_printf(m, " master:%i", master); 1037 if (dom && dom != master) 1038 seq_printf(m, " propagate_from:%i", dom); 1039 } 1040 if (IS_MNT_UNBINDABLE(mnt)) 1041 seq_puts(m, " unbindable"); 1042 1043 /* Filesystem specific data */ 1044 seq_puts(m, " - "); 1045 show_type(m, sb); 1046 seq_putc(m, ' '); 1047 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 1048 seq_puts(m, sb->s_flags & MS_RDONLY ? 
" ro" : " rw"); 1049 err = show_sb_opts(m, sb); 1050 if (err) 1051 goto out; 1052 if (sb->s_op->show_options) 1053 err = sb->s_op->show_options(m, mnt); 1054 seq_putc(m, '\n'); 1055 out: 1056 return err; 1057 } 1058 1059 const struct seq_operations mountinfo_op = { 1060 .start = m_start, 1061 .next = m_next, 1062 .stop = m_stop, 1063 .show = show_mountinfo, 1064 }; 1065 1066 static int show_vfsstat(struct seq_file *m, void *v) 1067 { 1068 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); 1069 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 1070 int err = 0; 1071 1072 /* device */ 1073 if (mnt->mnt_devname) { 1074 seq_puts(m, "device "); 1075 mangle(m, mnt->mnt_devname); 1076 } else 1077 seq_puts(m, "no device"); 1078 1079 /* mount point */ 1080 seq_puts(m, " mounted on "); 1081 seq_path(m, &mnt_path, " \t\n\\"); 1082 seq_putc(m, ' '); 1083 1084 /* file system type */ 1085 seq_puts(m, "with fstype "); 1086 show_type(m, mnt->mnt_sb); 1087 1088 /* optional statistics */ 1089 if (mnt->mnt_sb->s_op->show_stats) { 1090 seq_putc(m, ' '); 1091 err = mnt->mnt_sb->s_op->show_stats(m, mnt); 1092 } 1093 1094 seq_putc(m, '\n'); 1095 return err; 1096 } 1097 1098 const struct seq_operations mountstats_op = { 1099 .start = m_start, 1100 .next = m_next, 1101 .stop = m_stop, 1102 .show = show_vfsstat, 1103 }; 1104 #endif /* CONFIG_PROC_FS */ 1105 1106 /** 1107 * may_umount_tree - check if a mount tree is busy 1108 * @mnt: root of mount tree 1109 * 1110 * This is called to check if a tree of mounts has any 1111 * open files, pwds, chroots or sub mounts that are 1112 * busy. 1113 */ 1114 int may_umount_tree(struct vfsmount *mnt) 1115 { 1116 int actual_refs = 0; 1117 int minimum_refs = 0; 1118 struct vfsmount *p; 1119 1120 /* write lock needed for mnt_get_count */ 1121 br_write_lock(vfsmount_lock); 1122 for (p = mnt; p; p = next_mnt(p, mnt)) { 1123 actual_refs += mnt_get_count(p); 1124 minimum_refs += 2; 1125 } 1126 br_write_unlock(vfsmount_lock); 1127 1128 if (actual_refs > minimum_refs) 1129 return 0; 1130 1131 return 1; 1132 } 1133 1134 EXPORT_SYMBOL(may_umount_tree); 1135 1136 /** 1137 * may_umount - check if a mount point is busy 1138 * @mnt: root of mount 1139 * 1140 * This is called to check if a mount point has any 1141 * open files, pwds, chroots or sub mounts. If the 1142 * mount has sub mounts this will return busy 1143 * regardless of whether the sub mounts are busy. 1144 * 1145 * Doesn't take quota and stuff into account. IOW, in some cases it will 1146 * give false negatives. The main reason why it's here is that we need 1147 * a non-destructive way to look for easily umountable filesystems. 
 */
int may_umount(struct vfsmount *mnt)
{
	int ret = 1;
	down_read(&namespace_sem);
	br_write_lock(vfsmount_lock);
	if (propagate_mount_busy(mnt, 2))
		ret = 0;
	br_write_unlock(vfsmount_lock);
	up_read(&namespace_sem);
	return ret;
}

EXPORT_SYMBOL(may_umount);

void release_mounts(struct list_head *head)
{
	struct vfsmount *mnt;
	while (!list_empty(head)) {
		mnt = list_first_entry(head, struct vfsmount, mnt_hash);
		list_del_init(&mnt->mnt_hash);
		if (mnt->mnt_parent != mnt) {
			struct dentry *dentry;
			struct vfsmount *m;

			br_write_lock(vfsmount_lock);
			dentry = mnt->mnt_mountpoint;
			m = mnt->mnt_parent;
			mnt->mnt_mountpoint = mnt->mnt_root;
			mnt->mnt_parent = mnt;
			m->mnt_ghosts--;
			br_write_unlock(vfsmount_lock);
			dput(dentry);
			mntput(m);
		}
		mntput(mnt);
	}
}

/*
 * vfsmount lock must be held for write
 * namespace_sem must be held for write
 */
void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
{
	LIST_HEAD(tmp_list);
	struct vfsmount *p;

	for (p = mnt; p; p = next_mnt(p, mnt))
		list_move(&p->mnt_hash, &tmp_list);

	if (propagate)
		propagate_umount(&tmp_list);

	list_for_each_entry(p, &tmp_list, mnt_hash) {
		list_del_init(&p->mnt_expire);
		list_del_init(&p->mnt_list);
		__touch_mnt_namespace(p->mnt_ns);
		p->mnt_ns = NULL;
		__mnt_make_shortterm(p);
		list_del_init(&p->mnt_child);
		if (p->mnt_parent != p) {
			p->mnt_parent->mnt_ghosts++;
			dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint);
		}
		change_mnt_propagation(p, MS_PRIVATE);
	}
	list_splice(&tmp_list, kill);
}

static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);

static int do_umount(struct vfsmount *mnt, int flags)
{
	struct super_block *sb = mnt->mnt_sb;
	int retval;
	LIST_HEAD(umount_list);

	retval = security_sb_umount(mnt, flags);
	if (retval)
		return retval;

	/*
	 * Allow userspace to request a mountpoint be expired rather than
	 * unmounting unconditionally. Unmount only happens if:
	 *  (1) the mark is already set (the mark is cleared by mntput())
	 *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
	 */
	if (flags & MNT_EXPIRE) {
		if (mnt == current->fs->root.mnt ||
		    flags & (MNT_FORCE | MNT_DETACH))
			return -EINVAL;

		/*
		 * probably don't strictly need the lock here if we examined
		 * all race cases, but it's a slowpath.
		 */
		br_write_lock(vfsmount_lock);
		if (mnt_get_count(mnt) != 2) {
			br_write_unlock(vfsmount_lock);
			return -EBUSY;
		}
		br_write_unlock(vfsmount_lock);

		if (!xchg(&mnt->mnt_expiry_mark, 1))
			return -EAGAIN;
	}

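	/*
	 * Illustrative sketch, not part of the original file: from userspace
	 * the two-step expiry protocol above looks roughly like this,
	 * assuming "/mnt/auto" is an otherwise unused mount:
	 *
	 *	umount2("/mnt/auto", MNT_EXPIRE);   marks it, fails with EAGAIN
	 *	umount2("/mnt/auto", MNT_EXPIRE);   unmounts it if still unused
	 */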
	/*
	 * If we may have to abort operations to get out of this
	 * mount, and they will themselves hold resources we must
	 * allow the fs to do things. In the Unix tradition of
	 * 'Gee, that's tricky, let's do it in userspace' the umount_begin
	 * might fail to complete on the first run through as other tasks
	 * must return, and the like. That's for the mount program to worry
	 * about for the moment.
	 */

	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
		sb->s_op->umount_begin(sb);
	}

	/*
	 * No sense to grab the lock for this test, but test itself looks
	 * somewhat bogus. Suggestions for better replacement?
	 * Ho-hum... In principle, we might treat that as umount + switch
	 * to rootfs. GC would eventually take care of the old vfsmount.
	 * Actually it makes sense, especially if rootfs would contain a
	 * /reboot - static binary that would close all descriptors and
	 * call reboot(9). Then init(8) could umount root and exec /reboot.
	 */
	if (mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
		/*
		 * Special case for "unmounting" root ...
		 * we just try to remount it readonly.
		 */
		down_write(&sb->s_umount);
		if (!(sb->s_flags & MS_RDONLY))
			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
		up_write(&sb->s_umount);
		return retval;
	}

	down_write(&namespace_sem);
	br_write_lock(vfsmount_lock);
	event++;

	if (!(flags & MNT_DETACH))
		shrink_submounts(mnt, &umount_list);

	retval = -EBUSY;
	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
		if (!list_empty(&mnt->mnt_list))
			umount_tree(mnt, 1, &umount_list);
		retval = 0;
	}
	br_write_unlock(vfsmount_lock);
	up_write(&namespace_sem);
	release_mounts(&umount_list);
	return retval;
}

/*
 * Now umount can handle mount points as well as block devices.
 * This is important for filesystems which use unnamed block devices.
 *
 * We now support a flag for forced unmount like the other 'big iron'
 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
 */

SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
	struct path path;
	int retval;
	int lookup_flags = 0;

	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
		return -EINVAL;

	if (!(flags & UMOUNT_NOFOLLOW))
		lookup_flags |= LOOKUP_FOLLOW;

	retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
	if (retval)
		goto out;
	retval = -EINVAL;
	if (path.dentry != path.mnt->mnt_root)
		goto dput_and_out;
	if (!check_mnt(path.mnt))
		goto dput_and_out;

	retval = -EPERM;
	if (!capable(CAP_SYS_ADMIN))
		goto dput_and_out;

	retval = do_umount(path.mnt, flags);
dput_and_out:
	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
	dput(path.dentry);
	mntput_no_expire(path.mnt);
out:
	return retval;
}

#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 * The 2.0 compatible umount. No flags.
1356 */ 1357 SYSCALL_DEFINE1(oldumount, char __user *, name) 1358 { 1359 return sys_umount(name, 0); 1360 } 1361 1362 #endif 1363 1364 static int mount_is_safe(struct path *path) 1365 { 1366 if (capable(CAP_SYS_ADMIN)) 1367 return 0; 1368 return -EPERM; 1369 #ifdef notyet 1370 if (S_ISLNK(path->dentry->d_inode->i_mode)) 1371 return -EPERM; 1372 if (path->dentry->d_inode->i_mode & S_ISVTX) { 1373 if (current_uid() != path->dentry->d_inode->i_uid) 1374 return -EPERM; 1375 } 1376 if (inode_permission(path->dentry->d_inode, MAY_WRITE)) 1377 return -EPERM; 1378 return 0; 1379 #endif 1380 } 1381 1382 struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, 1383 int flag) 1384 { 1385 struct vfsmount *res, *p, *q, *r, *s; 1386 struct path path; 1387 1388 if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) 1389 return NULL; 1390 1391 res = q = clone_mnt(mnt, dentry, flag); 1392 if (!q) 1393 goto Enomem; 1394 q->mnt_mountpoint = mnt->mnt_mountpoint; 1395 1396 p = mnt; 1397 list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { 1398 if (!is_subdir(r->mnt_mountpoint, dentry)) 1399 continue; 1400 1401 for (s = r; s; s = next_mnt(s, r)) { 1402 if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) { 1403 s = skip_mnt_tree(s); 1404 continue; 1405 } 1406 while (p != s->mnt_parent) { 1407 p = p->mnt_parent; 1408 q = q->mnt_parent; 1409 } 1410 p = s; 1411 path.mnt = q; 1412 path.dentry = p->mnt_mountpoint; 1413 q = clone_mnt(p, p->mnt_root, flag); 1414 if (!q) 1415 goto Enomem; 1416 br_write_lock(vfsmount_lock); 1417 list_add_tail(&q->mnt_list, &res->mnt_list); 1418 attach_mnt(q, &path); 1419 br_write_unlock(vfsmount_lock); 1420 } 1421 } 1422 return res; 1423 Enomem: 1424 if (res) { 1425 LIST_HEAD(umount_list); 1426 br_write_lock(vfsmount_lock); 1427 umount_tree(res, 0, &umount_list); 1428 br_write_unlock(vfsmount_lock); 1429 release_mounts(&umount_list); 1430 } 1431 return NULL; 1432 } 1433 1434 struct vfsmount *collect_mounts(struct path *path) 1435 { 1436 struct vfsmount *tree; 1437 down_write(&namespace_sem); 1438 tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE); 1439 up_write(&namespace_sem); 1440 return tree; 1441 } 1442 1443 void drop_collected_mounts(struct vfsmount *mnt) 1444 { 1445 LIST_HEAD(umount_list); 1446 down_write(&namespace_sem); 1447 br_write_lock(vfsmount_lock); 1448 umount_tree(mnt, 0, &umount_list); 1449 br_write_unlock(vfsmount_lock); 1450 up_write(&namespace_sem); 1451 release_mounts(&umount_list); 1452 } 1453 1454 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, 1455 struct vfsmount *root) 1456 { 1457 struct vfsmount *mnt; 1458 int res = f(root, arg); 1459 if (res) 1460 return res; 1461 list_for_each_entry(mnt, &root->mnt_list, mnt_list) { 1462 res = f(mnt, arg); 1463 if (res) 1464 return res; 1465 } 1466 return 0; 1467 } 1468 1469 static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) 1470 { 1471 struct vfsmount *p; 1472 1473 for (p = mnt; p != end; p = next_mnt(p, mnt)) { 1474 if (p->mnt_group_id && !IS_MNT_SHARED(p)) 1475 mnt_release_group_id(p); 1476 } 1477 } 1478 1479 static int invent_group_ids(struct vfsmount *mnt, bool recurse) 1480 { 1481 struct vfsmount *p; 1482 1483 for (p = mnt; p; p = recurse ? 
					next_mnt(p, mnt) : NULL) {
		if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
			int err = mnt_alloc_group_id(p);
			if (err) {
				cleanup_group_ids(mnt, p);
				return err;
			}
		}
	}

	return 0;
}

/*
 * @source_mnt : mount tree to be attached
 * @nd         : place the mount tree @source_mnt is attached
 * @parent_nd  : if non-null, detach the source_mnt from its parent and
 *               store the parent mount and mountpoint dentry.
 *               (done when source_mnt is moved)
 *
 * NOTE: the table below explains the semantics when a source mount
 * of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 *       tree of the destination mount and the cloned mount is added to
 *       the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 *       source mount.
 *
 * ---------------------------------------------------------------------------
 * |                    MOVE MOUNT OPERATION                                 |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
 * ***************************************************************************
 *
 * (+)   the mount is moved to the destination. And is then propagated to
 *       all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++) the mount is moved to the destination and is then propagated to
 *       all the mounts belonging to the destination mount's propagation tree.
 *       the mount is marked as 'shared and slave'.
 * (*)   the mount continues to be a slave at the new location.
 *
 * if the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 */
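/*
 * Illustrative sketch, not part of the original file: the bind cases in the
 * first table map onto userspace sequences along these lines (the paths are
 * invented for the example):
 *
 *	mount(NULL, "/src", NULL, MS_SHARED, NULL);    source mount made shared
 *	mount(NULL, "/dst", NULL, MS_SHARED, NULL);    destination made shared
 *	mount("/src", "/dst/m", NULL, MS_BIND, NULL);  shared source + shared dest:
 *	                                               the clone joins the peer
 *	                                               group of /src and is
 *	                                               propagated to every peer
 *	                                               of /dst  (++)
 */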
static int attach_recursive_mnt(struct vfsmount *source_mnt,
			struct path *path, struct path *parent_path)
{
	LIST_HEAD(tree_list);
	struct vfsmount *dest_mnt = path->mnt;
	struct dentry *dest_dentry = path->dentry;
	struct vfsmount *child, *p;
	int err;

	if (IS_MNT_SHARED(dest_mnt)) {
		err = invent_group_ids(source_mnt, true);
		if (err)
			goto out;
	}
	err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
	if (err)
		goto out_cleanup_ids;

	br_write_lock(vfsmount_lock);

	if (IS_MNT_SHARED(dest_mnt)) {
		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
			set_mnt_shared(p);
	}
	if (parent_path) {
		detach_mnt(source_mnt, parent_path);
		attach_mnt(source_mnt, path);
		touch_mnt_namespace(parent_path->mnt->mnt_ns);
	} else {
		mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
		commit_tree(source_mnt);
	}

	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
		list_del_init(&child->mnt_hash);
		commit_tree(child);
	}
	br_write_unlock(vfsmount_lock);

	return 0;

out_cleanup_ids:
	if (IS_MNT_SHARED(dest_mnt))
		cleanup_group_ids(source_mnt, NULL);
out:
	return err;
}

static int graft_tree(struct vfsmount *mnt, struct path *path)
{
	int err;
	if (mnt->mnt_sb->s_flags & MS_NOUSER)
		return -EINVAL;

	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
	      S_ISDIR(mnt->mnt_root->d_inode->i_mode))
		return -ENOTDIR;

	err = -ENOENT;
	mutex_lock(&path->dentry->d_inode->i_mutex);
	if (cant_mount(path->dentry))
		goto out_unlock;

	if (!d_unlinked(path->dentry))
		err = attach_recursive_mnt(mnt, path, NULL);
out_unlock:
	mutex_unlock(&path->dentry->d_inode->i_mutex);
	return err;
}

/*
 * Sanity check the flags to change_mnt_propagation.
 */

static int flags_to_propagation_type(int flags)
{
	int type = flags & ~MS_REC;

	/* Fail if any non-propagation flags are set */
	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		return 0;
	/* Only one propagation flag should be set */
	if (!is_power_of_2(type))
		return 0;
	return type;
}

/*
 * recursively change the type of the mountpoint.
 */
static int do_change_type(struct path *path, int flag)
{
	struct vfsmount *m, *mnt = path->mnt;
	int recurse = flag & MS_REC;
	int type;
	int err = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	type = flags_to_propagation_type(flag);
	if (!type)
		return -EINVAL;

	down_write(&namespace_sem);
	if (type == MS_SHARED) {
		err = invent_group_ids(mnt, recurse);
		if (err)
			goto out_unlock;
	}

	br_write_lock(vfsmount_lock);
	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
		change_mnt_propagation(m, type);
	br_write_unlock(vfsmount_lock);

out_unlock:
	up_write(&namespace_sem);
	return err;
}

/*
 * do loopback mount.
1685 */ 1686 static int do_loopback(struct path *path, char *old_name, 1687 int recurse) 1688 { 1689 struct path old_path; 1690 struct vfsmount *mnt = NULL; 1691 int err = mount_is_safe(path); 1692 if (err) 1693 return err; 1694 if (!old_name || !*old_name) 1695 return -EINVAL; 1696 err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); 1697 if (err) 1698 return err; 1699 1700 down_write(&namespace_sem); 1701 err = -EINVAL; 1702 if (IS_MNT_UNBINDABLE(old_path.mnt)) 1703 goto out; 1704 1705 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1706 goto out; 1707 1708 err = -ENOMEM; 1709 if (recurse) 1710 mnt = copy_tree(old_path.mnt, old_path.dentry, 0); 1711 else 1712 mnt = clone_mnt(old_path.mnt, old_path.dentry, 0); 1713 1714 if (!mnt) 1715 goto out; 1716 1717 err = graft_tree(mnt, path); 1718 if (err) { 1719 LIST_HEAD(umount_list); 1720 1721 br_write_lock(vfsmount_lock); 1722 umount_tree(mnt, 0, &umount_list); 1723 br_write_unlock(vfsmount_lock); 1724 release_mounts(&umount_list); 1725 } 1726 1727 out: 1728 up_write(&namespace_sem); 1729 path_put(&old_path); 1730 return err; 1731 } 1732 1733 static int change_mount_flags(struct vfsmount *mnt, int ms_flags) 1734 { 1735 int error = 0; 1736 int readonly_request = 0; 1737 1738 if (ms_flags & MS_RDONLY) 1739 readonly_request = 1; 1740 if (readonly_request == __mnt_is_readonly(mnt)) 1741 return 0; 1742 1743 if (readonly_request) 1744 error = mnt_make_readonly(mnt); 1745 else 1746 __mnt_unmake_readonly(mnt); 1747 return error; 1748 } 1749 1750 /* 1751 * change filesystem flags. dir should be a physical root of filesystem. 1752 * If you've mounted a non-root directory somewhere and want to do remount 1753 * on it - tough luck. 1754 */ 1755 static int do_remount(struct path *path, int flags, int mnt_flags, 1756 void *data) 1757 { 1758 int err; 1759 struct super_block *sb = path->mnt->mnt_sb; 1760 1761 if (!capable(CAP_SYS_ADMIN)) 1762 return -EPERM; 1763 1764 if (!check_mnt(path->mnt)) 1765 return -EINVAL; 1766 1767 if (path->dentry != path->mnt->mnt_root) 1768 return -EINVAL; 1769 1770 down_write(&sb->s_umount); 1771 if (flags & MS_BIND) 1772 err = change_mount_flags(path->mnt, flags); 1773 else 1774 err = do_remount_sb(sb, flags, data, 0); 1775 if (!err) { 1776 br_write_lock(vfsmount_lock); 1777 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK; 1778 path->mnt->mnt_flags = mnt_flags; 1779 br_write_unlock(vfsmount_lock); 1780 } 1781 up_write(&sb->s_umount); 1782 if (!err) { 1783 br_write_lock(vfsmount_lock); 1784 touch_mnt_namespace(path->mnt->mnt_ns); 1785 br_write_unlock(vfsmount_lock); 1786 } 1787 return err; 1788 } 1789 1790 static inline int tree_contains_unbindable(struct vfsmount *mnt) 1791 { 1792 struct vfsmount *p; 1793 for (p = mnt; p; p = next_mnt(p, mnt)) { 1794 if (IS_MNT_UNBINDABLE(p)) 1795 return 1; 1796 } 1797 return 0; 1798 } 1799 1800 static int do_move_mount(struct path *path, char *old_name) 1801 { 1802 struct path old_path, parent_path; 1803 struct vfsmount *p; 1804 int err = 0; 1805 if (!capable(CAP_SYS_ADMIN)) 1806 return -EPERM; 1807 if (!old_name || !*old_name) 1808 return -EINVAL; 1809 err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); 1810 if (err) 1811 return err; 1812 1813 down_write(&namespace_sem); 1814 err = follow_down(path, true); 1815 if (err < 0) 1816 goto out; 1817 1818 err = -EINVAL; 1819 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1820 goto out; 1821 1822 err = -ENOENT; 1823 mutex_lock(&path->dentry->d_inode->i_mutex); 1824 if (cant_mount(path->dentry)) 1825 goto out1; 1826 1827 if 
(d_unlinked(path->dentry)) 1828 goto out1; 1829 1830 err = -EINVAL; 1831 if (old_path.dentry != old_path.mnt->mnt_root) 1832 goto out1; 1833 1834 if (old_path.mnt == old_path.mnt->mnt_parent) 1835 goto out1; 1836 1837 if (S_ISDIR(path->dentry->d_inode->i_mode) != 1838 S_ISDIR(old_path.dentry->d_inode->i_mode)) 1839 goto out1; 1840 /* 1841 * Don't move a mount residing in a shared parent. 1842 */ 1843 if (old_path.mnt->mnt_parent && 1844 IS_MNT_SHARED(old_path.mnt->mnt_parent)) 1845 goto out1; 1846 /* 1847 * Don't move a mount tree containing unbindable mounts to a destination 1848 * mount which is shared. 1849 */ 1850 if (IS_MNT_SHARED(path->mnt) && 1851 tree_contains_unbindable(old_path.mnt)) 1852 goto out1; 1853 err = -ELOOP; 1854 for (p = path->mnt; p->mnt_parent != p; p = p->mnt_parent) 1855 if (p == old_path.mnt) 1856 goto out1; 1857 1858 err = attach_recursive_mnt(old_path.mnt, path, &parent_path); 1859 if (err) 1860 goto out1; 1861 1862 /* if the mount is moved, it should no longer be expire 1863 * automatically */ 1864 list_del_init(&old_path.mnt->mnt_expire); 1865 out1: 1866 mutex_unlock(&path->dentry->d_inode->i_mutex); 1867 out: 1868 up_write(&namespace_sem); 1869 if (!err) 1870 path_put(&parent_path); 1871 path_put(&old_path); 1872 return err; 1873 } 1874 1875 static int do_add_mount(struct vfsmount *, struct path *, int); 1876 1877 /* 1878 * create a new mount for userspace and request it to be added into the 1879 * namespace's tree 1880 */ 1881 static int do_new_mount(struct path *path, char *type, int flags, 1882 int mnt_flags, char *name, void *data) 1883 { 1884 struct vfsmount *mnt; 1885 int err; 1886 1887 if (!type) 1888 return -EINVAL; 1889 1890 /* we need capabilities... */ 1891 if (!capable(CAP_SYS_ADMIN)) 1892 return -EPERM; 1893 1894 mnt = do_kern_mount(type, flags, name, data); 1895 if (IS_ERR(mnt)) 1896 return PTR_ERR(mnt); 1897 1898 err = do_add_mount(mnt, path, mnt_flags); 1899 if (err) 1900 mntput(mnt); 1901 return err; 1902 } 1903 1904 int finish_automount(struct vfsmount *m, struct path *path) 1905 { 1906 int err; 1907 /* The new mount record should have at least 2 refs to prevent it being 1908 * expired before we get a chance to add it 1909 */ 1910 BUG_ON(mnt_get_count(m) < 2); 1911 1912 if (m->mnt_sb == path->mnt->mnt_sb && 1913 m->mnt_root == path->dentry) { 1914 err = -ELOOP; 1915 goto fail; 1916 } 1917 1918 err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE); 1919 if (!err) 1920 return 0; 1921 fail: 1922 /* remove m from any expiration list it may be on */ 1923 if (!list_empty(&m->mnt_expire)) { 1924 down_write(&namespace_sem); 1925 br_write_lock(vfsmount_lock); 1926 list_del_init(&m->mnt_expire); 1927 br_write_unlock(vfsmount_lock); 1928 up_write(&namespace_sem); 1929 } 1930 mntput(m); 1931 mntput(m); 1932 return err; 1933 } 1934 1935 /* 1936 * add a mount into a namespace's mount tree 1937 */ 1938 static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags) 1939 { 1940 int err; 1941 1942 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); 1943 1944 down_write(&namespace_sem); 1945 /* Something was mounted here while we slept */ 1946 err = follow_down(path, true); 1947 if (err < 0) 1948 goto unlock; 1949 1950 err = -EINVAL; 1951 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) 1952 goto unlock; 1953 1954 /* Refuse the same filesystem on the same mount point */ 1955 err = -EBUSY; 1956 if (path->mnt->mnt_sb == newmnt->mnt_sb && 1957 path->mnt->mnt_root == path->dentry) 1958 goto unlock; 1959 1960 err 
= -EINVAL; 1961 if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) 1962 goto unlock; 1963 1964 newmnt->mnt_flags = mnt_flags; 1965 err = graft_tree(newmnt, path); 1966 1967 unlock: 1968 up_write(&namespace_sem); 1969 return err; 1970 } 1971 1972 /** 1973 * mnt_set_expiry - Put a mount on an expiration list 1974 * @mnt: The mount to list. 1975 * @expiry_list: The list to add the mount to. 1976 */ 1977 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) 1978 { 1979 down_write(&namespace_sem); 1980 br_write_lock(vfsmount_lock); 1981 1982 list_add_tail(&mnt->mnt_expire, expiry_list); 1983 1984 br_write_unlock(vfsmount_lock); 1985 up_write(&namespace_sem); 1986 } 1987 EXPORT_SYMBOL(mnt_set_expiry); 1988 1989 /* 1990 * process a list of expirable mountpoints with the intent of discarding any 1991 * mountpoints that aren't in use and haven't been touched since last we came 1992 * here 1993 */ 1994 void mark_mounts_for_expiry(struct list_head *mounts) 1995 { 1996 struct vfsmount *mnt, *next; 1997 LIST_HEAD(graveyard); 1998 LIST_HEAD(umounts); 1999 2000 if (list_empty(mounts)) 2001 return; 2002 2003 down_write(&namespace_sem); 2004 br_write_lock(vfsmount_lock); 2005 2006 /* extract from the expiration list every vfsmount that matches the 2007 * following criteria: 2008 * - only referenced by its parent vfsmount 2009 * - still marked for expiry (marked on the last call here; marks are 2010 * cleared by mntput()) 2011 */ 2012 list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { 2013 if (!xchg(&mnt->mnt_expiry_mark, 1) || 2014 propagate_mount_busy(mnt, 1)) 2015 continue; 2016 list_move(&mnt->mnt_expire, &graveyard); 2017 } 2018 while (!list_empty(&graveyard)) { 2019 mnt = list_first_entry(&graveyard, struct vfsmount, mnt_expire); 2020 touch_mnt_namespace(mnt->mnt_ns); 2021 umount_tree(mnt, 1, &umounts); 2022 } 2023 br_write_unlock(vfsmount_lock); 2024 up_write(&namespace_sem); 2025 2026 release_mounts(&umounts); 2027 } 2028 2029 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); 2030 2031 /* 2032 * Ripoff of 'select_parent()' 2033 * 2034 * search the list of submounts for a given mountpoint, and move any 2035 * shrinkable submounts to the 'graveyard' list. 2036 */ 2037 static int select_submounts(struct vfsmount *parent, struct list_head *graveyard) 2038 { 2039 struct vfsmount *this_parent = parent; 2040 struct list_head *next; 2041 int found = 0; 2042 2043 repeat: 2044 next = this_parent->mnt_mounts.next; 2045 resume: 2046 while (next != &this_parent->mnt_mounts) { 2047 struct list_head *tmp = next; 2048 struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child); 2049 2050 next = tmp->next; 2051 if (!(mnt->mnt_flags & MNT_SHRINKABLE)) 2052 continue; 2053 /* 2054 * Descend a level if the d_mounts list is non-empty. 2055 */ 2056 if (!list_empty(&mnt->mnt_mounts)) { 2057 this_parent = mnt; 2058 goto repeat; 2059 } 2060 2061 if (!propagate_mount_busy(mnt, 1)) { 2062 list_move_tail(&mnt->mnt_expire, graveyard); 2063 found++; 2064 } 2065 } 2066 /* 2067 * All done at this level ... 
/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 */
static int select_submounts(struct vfsmount *parent, struct list_head *graveyard)
{
	struct vfsmount *this_parent = parent;
	struct list_head *next;
	int found = 0;

repeat:
	next = this_parent->mnt_mounts.next;
resume:
	while (next != &this_parent->mnt_mounts) {
		struct list_head *tmp = next;
		struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child);

		next = tmp->next;
		if (!(mnt->mnt_flags & MNT_SHRINKABLE))
			continue;
		/*
		 * Descend a level if the mnt_mounts list is non-empty.
		 */
		if (!list_empty(&mnt->mnt_mounts)) {
			this_parent = mnt;
			goto repeat;
		}

		if (!propagate_mount_busy(mnt, 1)) {
			list_move_tail(&mnt->mnt_expire, graveyard);
			found++;
		}
	}
	/*
	 * All done at this level ... ascend and resume the search
	 */
	if (this_parent != parent) {
		next = this_parent->mnt_child.next;
		this_parent = this_parent->mnt_parent;
		goto resume;
	}
	return found;
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 *
 * vfsmount_lock must be held for write
 */
static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
{
	LIST_HEAD(graveyard);
	struct vfsmount *m;

	/* extract submounts of 'mountpoint' from the expiration list */
	while (select_submounts(mnt, &graveyard)) {
		while (!list_empty(&graveyard)) {
			m = list_first_entry(&graveyard, struct vfsmount,
						mnt_expire);
			touch_mnt_namespace(m->mnt_ns);
			umount_tree(m, 1, umounts);
		}
	}
}

/*
 * Some copy_from_user() implementations do not return the exact number of
 * bytes remaining to copy on a fault.  But copy_mount_options() requires that.
 * Note that this function differs from copy_from_user() in that it will oops
 * on bad values of `to', rather than returning a short copy.
 */
static long exact_copy_from_user(void *to, const void __user * from,
				 unsigned long n)
{
	char *t = to;
	const char __user *f = from;
	char c;

	if (!access_ok(VERIFY_READ, from, n))
		return n;

	while (n) {
		if (__get_user(c, f)) {
			memset(t, 0, n);
			break;
		}
		*t++ = c;
		f++;
		n--;
	}
	return n;
}

int copy_mount_options(const void __user * data, unsigned long *where)
{
	int i;
	unsigned long page;
	unsigned long size;

	*where = 0;
	if (!data)
		return 0;

	if (!(page = __get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	/* We only care that *some* data at the address the user
	 * gave us is valid.  Just in case, we'll zero
	 * the remainder of the page.
	 */
	/* copy_from_user cannot cross TASK_SIZE ! */
	size = TASK_SIZE - (unsigned long)data;
	if (size > PAGE_SIZE)
		size = PAGE_SIZE;

	i = size - exact_copy_from_user((void *)page, data, size);
	if (!i) {
		free_page(page);
		return -EFAULT;
	}
	if (i != PAGE_SIZE)
		memset((char *)page + i, 0, PAGE_SIZE - i);
	*where = page;
	return 0;
}

int copy_mount_string(const void __user *data, char **where)
{
	char *tmp;

	if (!data) {
		*where = NULL;
		return 0;
	}

	tmp = strndup_user(data, PAGE_SIZE);
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);

	*where = tmp;
	return 0;
}
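/*
 * Worked example (added for exposition, not part of the original source),
 * assuming PAGE_SIZE is 4096: if userspace passes a mount-options pointer
 * where only the first 100 bytes of the range are mapped, access_ok() still
 * succeeds, exact_copy_from_user() above copies those 100 bytes, zeroes the
 * remaining 3996 destination bytes and returns 3996.  copy_mount_options()
 * then computes i = 4096 - 3996 = 100, keeps the page because some data was
 * copied, and zero-fills the tail, so the filesystem that later parses the
 * options page never sees uninitialised memory.
 */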
/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
long do_mount(char *dev_name, char *dir_name, char *type_page,
		  unsigned long flags, void *data_page)
{
	struct path path;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */

	if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
		return -EINVAL;

	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* ... and get the mountpoint */
	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &path,
				   type_page, flags, data_page);
	if (retval)
		goto dput_out;

	/* Default to relatime unless overridden */
	if (!(flags & MS_NOATIME))
		mnt_flags |= MNT_RELATIME;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (flags & MS_STRICTATIME)
		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
	if (flags & MS_RDONLY)
		mnt_flags |= MNT_READONLY;

	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_KERNMOUNT |
		   MS_STRICTATIME);

	if (flags & MS_REMOUNT)
		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	else if (flags & MS_BIND)
		retval = do_loopback(&path, dev_name, flags & MS_REC);
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&path, flags);
	else if (flags & MS_MOVE)
		retval = do_move_mount(&path, dev_name);
	else
		retval = do_new_mount(&path, type_page, flags, mnt_flags,
				      dev_name, data_page);
dput_out:
	path_put(&path);
	return retval;
}

static struct mnt_namespace *alloc_mnt_ns(void)
{
	struct mnt_namespace *new_ns;

	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
	if (!new_ns)
		return ERR_PTR(-ENOMEM);
	atomic_set(&new_ns->count, 1);
	new_ns->root = NULL;
	INIT_LIST_HEAD(&new_ns->list);
	init_waitqueue_head(&new_ns->poll);
	new_ns->event = 0;
	return new_ns;
}

void mnt_make_longterm(struct vfsmount *mnt)
{
	__mnt_make_longterm(mnt);
}

void mnt_make_shortterm(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
	if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
		return;
	br_write_lock(vfsmount_lock);
	atomic_dec(&mnt->mnt_longterm);
	br_write_unlock(vfsmount_lock);
#endif
}
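/*
 * Illustrative examples (added for exposition, not part of the original
 * source) of how typical mount(2) requests map onto the dispatch at the end
 * of do_mount() above; the devices and paths are made up:
 *
 *	mount("/dev/sda1", "/mnt", "ext4", MS_RDONLY, NULL);
 *		-> do_new_mount(), with MNT_READONLY in mnt_flags
 *	mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_RDONLY, NULL);
 *		-> do_remount()
 *	mount("/srv", "/mnt", NULL, MS_BIND, NULL);
 *		-> do_loopback()
 *	mount(NULL, "/mnt", NULL, MS_SHARED, NULL);
 *		-> do_change_type()
 *	mount("/mnt", "/newplace", NULL, MS_MOVE, NULL);
 *		-> do_move_mount()
 */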
/*
 * Allocate a new namespace structure and populate it with contents
 * copied from the namespace of the passed in task structure.
 */
static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
		struct fs_struct *fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
	struct vfsmount *p, *q;

	new_ns = alloc_mnt_ns();
	if (IS_ERR(new_ns))
		return new_ns;

	down_write(&namespace_sem);
	/* First pass: copy the tree topology */
	new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root,
					CL_COPY_ALL | CL_EXPIRE);
	if (!new_ns->root) {
		up_write(&namespace_sem);
		kfree(new_ns);
		return ERR_PTR(-ENOMEM);
	}
	br_write_lock(vfsmount_lock);
	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
	br_write_unlock(vfsmount_lock);

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace.  We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
	 */
	p = mnt_ns->root;
	q = new_ns->root;
	while (p) {
		q->mnt_ns = new_ns;
		__mnt_make_longterm(q);
		if (fs) {
			if (p == fs->root.mnt) {
				fs->root.mnt = mntget(q);
				__mnt_make_longterm(q);
				mnt_make_shortterm(p);
				rootmnt = p;
			}
			if (p == fs->pwd.mnt) {
				fs->pwd.mnt = mntget(q);
				__mnt_make_longterm(q);
				mnt_make_shortterm(p);
				pwdmnt = p;
			}
		}
		p = next_mnt(p, mnt_ns->root);
		q = next_mnt(q, new_ns->root);
	}
	up_write(&namespace_sem);

	if (rootmnt)
		mntput(rootmnt);
	if (pwdmnt)
		mntput(pwdmnt);

	return new_ns;
}

struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
		struct fs_struct *new_fs)
{
	struct mnt_namespace *new_ns;

	BUG_ON(!ns);
	get_mnt_ns(ns);

	if (!(flags & CLONE_NEWNS))
		return ns;

	new_ns = dup_mnt_ns(ns, new_fs);

	put_mnt_ns(ns);
	return new_ns;
}

/**
 * create_mnt_ns - creates a private namespace and adds a root filesystem
 * @mnt: pointer to the new root filesystem mountpoint
 */
struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
{
	struct mnt_namespace *new_ns;

	new_ns = alloc_mnt_ns();
	if (!IS_ERR(new_ns)) {
		mnt->mnt_ns = new_ns;
		__mnt_make_longterm(mnt);
		new_ns->root = mnt;
		list_add(&new_ns->list, &new_ns->root->mnt_list);
	}
	return new_ns;
}
EXPORT_SYMBOL(create_mnt_ns);

SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	int ret;
	char *kernel_type;
	char *kernel_dir;
	char *kernel_dev;
	unsigned long data_page;

	ret = copy_mount_string(type, &kernel_type);
	if (ret < 0)
		goto out_type;

	kernel_dir = getname(dir_name);
	if (IS_ERR(kernel_dir)) {
		ret = PTR_ERR(kernel_dir);
		goto out_dir;
	}

	ret = copy_mount_string(dev_name, &kernel_dev);
	if (ret < 0)
		goto out_dev;

	ret = copy_mount_options(data, &data_page);
	if (ret < 0)
		goto out_data;

	ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags,
		       (void *) data_page);

	free_page(data_page);
out_data:
	kfree(kernel_dev);
out_dev:
	putname(kernel_dir);
out_dir:
	kfree(kernel_type);
out_type:
	return ret;
}
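/*
 * Illustrative userspace sketch (added for exposition, not part of the
 * original source): a process that is expected to obtain a private copy of
 * the mount tree, built by dup_mnt_ns()/copy_mnt_ns() above, by unsharing
 * its mount namespace.  Mounts it makes afterwards are no longer visible to
 * the rest of the system, subject to the propagation type of the parent
 * mounts that were copied.
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <sys/mount.h>
 *
 *	if (unshare(CLONE_NEWNS) == 0)
 *		mount("tmpfs", "/tmp", "tmpfs", 0, NULL);  // private to this namespace
 */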
/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
		const char __user *, put_old)
{
	struct vfsmount *tmp;
	struct path new, old, parent_path, root_parent, root;
	int error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	error = user_path_dir(new_root, &new);
	if (error)
		goto out0;
	error = -EINVAL;
	if (!check_mnt(new.mnt))
		goto out1;

	error = user_path_dir(put_old, &old);
	if (error)
		goto out1;

	error = security_sb_pivotroot(&old, &new);
	if (error) {
		path_put(&old);
		goto out1;
	}

	get_fs_root(current->fs, &root);
	down_write(&namespace_sem);
	mutex_lock(&old.dentry->d_inode->i_mutex);
	error = -EINVAL;
	if (IS_MNT_SHARED(old.mnt) ||
		IS_MNT_SHARED(new.mnt->mnt_parent) ||
		IS_MNT_SHARED(root.mnt->mnt_parent))
		goto out2;
	if (!check_mnt(root.mnt))
		goto out2;
	error = -ENOENT;
	if (cant_mount(old.dentry))
		goto out2;
	if (d_unlinked(new.dentry))
		goto out2;
	if (d_unlinked(old.dentry))
		goto out2;
	error = -EBUSY;
	if (new.mnt == root.mnt ||
		old.mnt == root.mnt)
		goto out2; /* loop, on the same file system */
	error = -EINVAL;
	if (root.mnt->mnt_root != root.dentry)
		goto out2; /* not a mountpoint */
	if (root.mnt->mnt_parent == root.mnt)
		goto out2; /* not attached */
	if (new.mnt->mnt_root != new.dentry)
		goto out2; /* not a mountpoint */
	if (new.mnt->mnt_parent == new.mnt)
		goto out2; /* not attached */
	/* make sure we can reach put_old from new_root */
	tmp = old.mnt;
	br_write_lock(vfsmount_lock);
	if (tmp != new.mnt) {
		for (;;) {
			if (tmp->mnt_parent == tmp)
				goto out3; /* already mounted on put_old */
			if (tmp->mnt_parent == new.mnt)
				break;
			tmp = tmp->mnt_parent;
		}
		if (!is_subdir(tmp->mnt_mountpoint, new.dentry))
			goto out3;
	} else if (!is_subdir(old.dentry, new.dentry))
		goto out3;
	detach_mnt(new.mnt, &parent_path);
	detach_mnt(root.mnt, &root_parent);
	/* mount old root on put_old */
	attach_mnt(root.mnt, &old);
	/* mount new_root on / */
	attach_mnt(new.mnt, &root_parent);
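	/*
	 * Explanatory note (added, not in the original source): at this point
	 * the old root has been re-attached under put_old and new_root has
	 * been attached where the old root used to sit, so the swap itself is
	 * complete; what remains is to notify the namespace and repoint the
	 * root/cwd of processes still referring to the old root, which
	 * chroot_fs_refs() below takes care of.
	 */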
	touch_mnt_namespace(current->nsproxy->mnt_ns);
	br_write_unlock(vfsmount_lock);
	chroot_fs_refs(&root, &new);

	error = 0;
	path_put(&root_parent);
	path_put(&parent_path);
out2:
	mutex_unlock(&old.dentry->d_inode->i_mutex);
	up_write(&namespace_sem);
	path_put(&root);
	path_put(&old);
out1:
	path_put(&new);
out0:
	return error;
out3:
	br_write_unlock(vfsmount_lock);
	goto out2;
}

static void __init init_mount_tree(void)
{
	struct vfsmount *mnt;
	struct mnt_namespace *ns;
	struct path root;

	mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
	if (IS_ERR(mnt))
		panic("Can't create rootfs");

	ns = create_mnt_ns(mnt);
	if (IS_ERR(ns))
		panic("Can't allocate initial namespace");

	init_task.nsproxy->mnt_ns = ns;
	get_mnt_ns(ns);

	root.mnt = ns->root;
	root.dentry = ns->root->mnt_root;

	set_fs_pwd(current->fs, &root);
	set_fs_root(current->fs, &root);
}

void __init mnt_init(void)
{
	unsigned u;
	int err;

	init_rwsem(&namespace_sem);

	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);

	if (!mount_hashtable)
		panic("Failed to allocate mount hash table\n");

	printk("Mount-cache hash table entries: %lu\n", HASH_SIZE);

	for (u = 0; u < HASH_SIZE; u++)
		INIT_LIST_HEAD(&mount_hashtable[u]);

	br_lock_init(vfsmount_lock);

	err = sysfs_init();
	if (err)
		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
			__func__, err);
	fs_kobj = kobject_create_and_add("fs", NULL);
	if (!fs_kobj)
		printk(KERN_WARNING "%s: kobj create error\n", __func__);
	init_rootfs();
	init_mount_tree();
}

void put_mnt_ns(struct mnt_namespace *ns)
{
	LIST_HEAD(umount_list);

	if (!atomic_dec_and_test(&ns->count))
		return;
	down_write(&namespace_sem);
	br_write_lock(vfsmount_lock);
	umount_tree(ns->root, 0, &umount_list);
	br_write_unlock(vfsmount_lock);
	up_write(&namespace_sem);
	release_mounts(&umount_list);
	kfree(ns);
}
EXPORT_SYMBOL(put_mnt_ns);
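/*
 * Illustrative userspace sketch (added for exposition, not part of the
 * original source): a typical sequence that exercises sys_pivot_root() above,
 * e.g. from an initrd-style boot or a container setup tool, once the real
 * root has been mounted under the current root.  As the comment before
 * pivot_root() notes, this does not work while the current root is rootfs
 * (initramfs).  Paths are examples only; pivot_root usually has no libc
 * wrapper, so syscall(SYS_pivot_root, ...) is shown.
 *
 *	mount("/dev/sda1", "/newroot", "ext4", MS_RDONLY, NULL);
 *	mkdir("/newroot/oldroot", 0700);
 *	chdir("/newroot");
 *	syscall(SYS_pivot_root, ".", "oldroot");
 *	chroot(".");
 *	chdir("/");
 *	umount2("/oldroot", MNT_DETACH);
 */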