xref: /openbmc/linux/fs/namespace.c (revision d7439fb1f4338fffd0bc68bb62d78f7712725f26)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  linux/fs/namespace.c
4  *
5  * (C) Copyright Al Viro 2000, 2001
6  *
7  * Based on code from fs/super.c, copyright Linus Torvalds and others.
8  * Heavily rewritten.
9  */
10 
11 #include <linux/syscalls.h>
12 #include <linux/export.h>
13 #include <linux/capability.h>
14 #include <linux/mnt_namespace.h>
15 #include <linux/user_namespace.h>
16 #include <linux/namei.h>
17 #include <linux/security.h>
18 #include <linux/cred.h>
19 #include <linux/idr.h>
20 #include <linux/init.h>		/* init_rootfs */
21 #include <linux/fs_struct.h>	/* get_fs_root et al. */
22 #include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
23 #include <linux/file.h>
24 #include <linux/uaccess.h>
25 #include <linux/proc_ns.h>
26 #include <linux/magic.h>
27 #include <linux/memblock.h>
28 #include <linux/proc_fs.h>
29 #include <linux/task_work.h>
30 #include <linux/sched/task.h>
31 #include <uapi/linux/mount.h>
32 #include <linux/fs_context.h>
33 #include <linux/shmem_fs.h>
34 #include <linux/mnt_idmapping.h>
35 
36 #include "pnode.h"
37 #include "internal.h"
38 
39 /* Maximum number of mounts in a mount namespace */
40 static unsigned int sysctl_mount_max __read_mostly = 100000;
41 
42 static unsigned int m_hash_mask __read_mostly;
43 static unsigned int m_hash_shift __read_mostly;
44 static unsigned int mp_hash_mask __read_mostly;
45 static unsigned int mp_hash_shift __read_mostly;
46 
47 static __initdata unsigned long mhash_entries;
48 static int __init set_mhash_entries(char *str)
49 {
50 	if (!str)
51 		return 0;
52 	mhash_entries = simple_strtoul(str, &str, 0);
53 	return 1;
54 }
55 __setup("mhash_entries=", set_mhash_entries);
56 
57 static __initdata unsigned long mphash_entries;
58 static int __init set_mphash_entries(char *str)
59 {
60 	if (!str)
61 		return 0;
62 	mphash_entries = simple_strtoul(str, &str, 0);
63 	return 1;
64 }
65 __setup("mphash_entries=", set_mphash_entries);
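
/*
 * Both of the above are boot-time kernel command-line parameters; e.g.
 * booting with "mhash_entries=1048576 mphash_entries=65536" (illustrative
 * values) sizes the mount and mountpoint hash tables explicitly instead of
 * letting them scale with available memory.
 */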
66 
67 static u64 event;
68 static DEFINE_IDA(mnt_id_ida);
69 static DEFINE_IDA(mnt_group_ida);
70 
71 static struct hlist_head *mount_hashtable __read_mostly;
72 static struct hlist_head *mountpoint_hashtable __read_mostly;
73 static struct kmem_cache *mnt_cache __read_mostly;
74 static DECLARE_RWSEM(namespace_sem);
75 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
76 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
77 
78 struct mount_kattr {
79 	unsigned int attr_set;
80 	unsigned int attr_clr;
81 	unsigned int propagation;
82 	unsigned int lookup_flags;
83 	bool recurse;
84 	struct user_namespace *mnt_userns;
85 	struct mnt_idmap *mnt_idmap;
86 };
87 
88 /* /sys/fs */
89 struct kobject *fs_kobj;
90 EXPORT_SYMBOL_GPL(fs_kobj);
91 
92 /*
93  * vfsmount lock may be taken for read to prevent changes to the
94  * vfsmount hash, i.e. during mountpoint lookups or walking back
95  * up the tree.
96  *
97  * It should be taken for write in all cases where the vfsmount
98  * tree or hash is modified or when a vfsmount structure is modified.
99  */
100 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
101 
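/*
 * Illustrative read-side pattern (a sketch of what lookup_mnt() below does):
 *
 *	rcu_read_lock();
 *	do {
 *		seq = read_seqbegin(&mount_lock);
 *		... look up in the hash under the seqcount ...
 *	} while (read_seqretry(&mount_lock, seq));
 *	rcu_read_unlock();
 */
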
102 static inline void lock_mount_hash(void)
103 {
104 	write_seqlock(&mount_lock);
105 }
106 
107 static inline void unlock_mount_hash(void)
108 {
109 	write_sequnlock(&mount_lock);
110 }
111 
112 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
113 {
114 	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
115 	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
116 	tmp = tmp + (tmp >> m_hash_shift);
117 	return &mount_hashtable[tmp & m_hash_mask];
118 }
119 
120 static inline struct hlist_head *mp_hash(struct dentry *dentry)
121 {
122 	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
123 	tmp = tmp + (tmp >> mp_hash_shift);
124 	return &mountpoint_hashtable[tmp & mp_hash_mask];
125 }
126 
127 static int mnt_alloc_id(struct mount *mnt)
128 {
129 	int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);
130 
131 	if (res < 0)
132 		return res;
133 	mnt->mnt_id = res;
134 	return 0;
135 }
136 
137 static void mnt_free_id(struct mount *mnt)
138 {
139 	ida_free(&mnt_id_ida, mnt->mnt_id);
140 }
141 
142 /*
143  * Allocate a new peer group ID
144  */
145 static int mnt_alloc_group_id(struct mount *mnt)
146 {
147 	int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);
148 
149 	if (res < 0)
150 		return res;
151 	mnt->mnt_group_id = res;
152 	return 0;
153 }
154 
155 /*
156  * Release a peer group ID
157  */
158 void mnt_release_group_id(struct mount *mnt)
159 {
160 	ida_free(&mnt_group_ida, mnt->mnt_group_id);
161 	mnt->mnt_group_id = 0;
162 }
163 
164 /*
165  * vfsmount lock must be held for read
166  */
167 static inline void mnt_add_count(struct mount *mnt, int n)
168 {
169 #ifdef CONFIG_SMP
170 	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
171 #else
172 	preempt_disable();
173 	mnt->mnt_count += n;
174 	preempt_enable();
175 #endif
176 }
177 
178 /*
179  * vfsmount lock must be held for write
180  */
181 int mnt_get_count(struct mount *mnt)
182 {
183 #ifdef CONFIG_SMP
184 	int count = 0;
185 	int cpu;
186 
187 	for_each_possible_cpu(cpu) {
188 		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
189 	}
190 
191 	return count;
192 #else
193 	return mnt->mnt_count;
194 #endif
195 }
196 
197 static struct mount *alloc_vfsmnt(const char *name)
198 {
199 	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
200 	if (mnt) {
201 		int err;
202 
203 		err = mnt_alloc_id(mnt);
204 		if (err)
205 			goto out_free_cache;
206 
207 		if (name) {
208 			mnt->mnt_devname = kstrdup_const(name,
209 							 GFP_KERNEL_ACCOUNT);
210 			if (!mnt->mnt_devname)
211 				goto out_free_id;
212 		}
213 
214 #ifdef CONFIG_SMP
215 		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
216 		if (!mnt->mnt_pcp)
217 			goto out_free_devname;
218 
219 		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
220 #else
221 		mnt->mnt_count = 1;
222 		mnt->mnt_writers = 0;
223 #endif
224 
225 		INIT_HLIST_NODE(&mnt->mnt_hash);
226 		INIT_LIST_HEAD(&mnt->mnt_child);
227 		INIT_LIST_HEAD(&mnt->mnt_mounts);
228 		INIT_LIST_HEAD(&mnt->mnt_list);
229 		INIT_LIST_HEAD(&mnt->mnt_expire);
230 		INIT_LIST_HEAD(&mnt->mnt_share);
231 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
232 		INIT_LIST_HEAD(&mnt->mnt_slave);
233 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
234 		INIT_LIST_HEAD(&mnt->mnt_umounting);
235 		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
236 		mnt->mnt.mnt_idmap = &nop_mnt_idmap;
237 	}
238 	return mnt;
239 
240 #ifdef CONFIG_SMP
241 out_free_devname:
242 	kfree_const(mnt->mnt_devname);
243 #endif
244 out_free_id:
245 	mnt_free_id(mnt);
246 out_free_cache:
247 	kmem_cache_free(mnt_cache, mnt);
248 	return NULL;
249 }
250 
251 /*
252  * Most r/o checks on a fs are for operations that take
253  * discrete amounts of time, like a write() or unlink().
254  * We must keep track of when those operations start
255  * (for permission checks) and when they end, so that
256  * we can determine when writes are able to occur to
257  * a filesystem.
258  */
259 /*
260  * __mnt_is_readonly: check whether a mount is read-only
261  * @mnt: the mount to check for its write status
262  *
263  * This shouldn't be used directly outside of the VFS.
264  * It does not guarantee that the filesystem will stay
265  * r/w, just that it is right *now*.  This can not and
266  * should not be used in place of IS_RDONLY(inode).
267  * mnt_want/drop_write() will _keep_ the filesystem
268  * r/w.
269  */
270 bool __mnt_is_readonly(struct vfsmount *mnt)
271 {
272 	return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
273 }
274 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
275 
276 static inline void mnt_inc_writers(struct mount *mnt)
277 {
278 #ifdef CONFIG_SMP
279 	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
280 #else
281 	mnt->mnt_writers++;
282 #endif
283 }
284 
285 static inline void mnt_dec_writers(struct mount *mnt)
286 {
287 #ifdef CONFIG_SMP
288 	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
289 #else
290 	mnt->mnt_writers--;
291 #endif
292 }
293 
294 static unsigned int mnt_get_writers(struct mount *mnt)
295 {
296 #ifdef CONFIG_SMP
297 	unsigned int count = 0;
298 	int cpu;
299 
300 	for_each_possible_cpu(cpu) {
301 		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
302 	}
303 
304 	return count;
305 #else
306 	return mnt->mnt_writers;
307 #endif
308 }
309 
310 static int mnt_is_readonly(struct vfsmount *mnt)
311 {
312 	if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
313 		return 1;
314 	/*
315 	 * The barrier pairs with the barrier in sb_start_ro_state_change()
316 	 * making sure if we don't see s_readonly_remount set yet, we also will
317 	 * not see any superblock / mount flag changes done by remount.
318 	 * It also pairs with the barrier in sb_end_ro_state_change()
319 	 * assuring that if we see s_readonly_remount already cleared, we will
320 	 * see the values of superblock / mount flags updated by remount.
321 	 */
322 	smp_rmb();
323 	return __mnt_is_readonly(mnt);
324 }
325 
326 /*
327  * Most r/o & frozen checks on a fs are for operations that take discrete
328  * amounts of time, like a write() or unlink().  We must keep track of when
329  * those operations start (for permission checks) and when they end, so that we
330  * can determine when writes are able to occur to a filesystem.
331  */
332 /**
333  * __mnt_want_write - get write access to a mount without freeze protection
334  * @m: the mount on which to take a write
335  *
336  * This tells the low-level filesystem that a write is about to be performed to
337  * it, and makes sure that writes are allowed (the mount is read-write) before
338  * returning success. This operation does not protect against the filesystem
339  * being frozen. When the write operation is finished, __mnt_drop_write() must be
340  * called. This is effectively a refcount.
341  */
342 int __mnt_want_write(struct vfsmount *m)
343 {
344 	struct mount *mnt = real_mount(m);
345 	int ret = 0;
346 
347 	preempt_disable();
348 	mnt_inc_writers(mnt);
349 	/*
350 	 * The store done by mnt_inc_writers() must be visible before we check
351 	 * MNT_WRITE_HOLD in the loop below, so that the slowpath can see our
352 	 * incremented count after it has set MNT_WRITE_HOLD.
353 	 */
354 	smp_mb();
355 	might_lock(&mount_lock.lock);
356 	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
357 		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
358 			cpu_relax();
359 		} else {
360 			/*
361 			 * This prevents priority inversion, if the task
362 			 * setting MNT_WRITE_HOLD got preempted on a remote
363 			 * CPU, and it prevents livelock if the task setting
364 			 * MNT_WRITE_HOLD has a lower priority and is bound to
365 			 * the same CPU as the task that is spinning here.
366 			 */
367 			preempt_enable();
368 			lock_mount_hash();
369 			unlock_mount_hash();
370 			preempt_disable();
371 		}
372 	}
373 	/*
374 	 * The barrier pairs with the barrier in sb_start_ro_state_change() making
375 	 * sure that if we see MNT_WRITE_HOLD cleared, we will also see
376 	 * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
377 	 * mnt_is_readonly() and bail in case we are racing with remount
378 	 * read-only.
379 	 */
380 	smp_rmb();
381 	if (mnt_is_readonly(m)) {
382 		mnt_dec_writers(mnt);
383 		ret = -EROFS;
384 	}
385 	preempt_enable();
386 
387 	return ret;
388 }
389 
390 /**
391  * mnt_want_write - get write access to a mount
392  * @m: the mount on which to take a write
393  *
394  * This tells the low-level filesystem that a write is about to be performed to
395  * it, and makes sure that writes are allowed (mount is read-write, filesystem
396  * is not frozen) before returning success.  When the write operation is
397  * finished, mnt_drop_write() must be called.  This is effectively a refcount.
398  */
399 int mnt_want_write(struct vfsmount *m)
400 {
401 	int ret;
402 
403 	sb_start_write(m->mnt_sb);
404 	ret = __mnt_want_write(m);
405 	if (ret)
406 		sb_end_write(m->mnt_sb);
407 	return ret;
408 }
409 EXPORT_SYMBOL_GPL(mnt_want_write);
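
/*
 * Typical caller pattern (an illustrative sketch, not code from this file):
 *
 *	err = mnt_want_write(path->mnt);
 *	if (err)
 *		return err;
 *	... modify the filesystem ...
 *	mnt_drop_write(path->mnt);
 */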
410 
411 /**
412  * __mnt_want_write_file - get write access to a file's mount
413  * @file: the file whose mount to take a write on
414  *
415  * This is like __mnt_want_write, but if the file is already open for writing it
416  * skips incrementing mnt_writers (since the open file already has a reference)
417  * and instead only does the check for emergency r/o remounts.  This must be
418  * paired with __mnt_drop_write_file.
419  */
420 int __mnt_want_write_file(struct file *file)
421 {
422 	if (file->f_mode & FMODE_WRITER) {
423 		/*
424 		 * Superblock may have become readonly while there are still
425 		 * writable fd's, e.g. due to a fs error with errors=remount-ro
426 		 */
427 		if (__mnt_is_readonly(file->f_path.mnt))
428 			return -EROFS;
429 		return 0;
430 	}
431 	return __mnt_want_write(file->f_path.mnt);
432 }
433 
434 /**
435  * mnt_want_write_file - get write access to a file's mount
436  * @file: the file whose mount to take a write on
437  *
438  * This is like mnt_want_write, but if the file is already open for writing it
439  * skips incrementing mnt_writers (since the open file already has a reference)
440  * and instead only does the freeze protection and the check for emergency r/o
441  * remounts.  This must be paired with mnt_drop_write_file.
442  */
443 int mnt_want_write_file(struct file *file)
444 {
445 	int ret;
446 
447 	sb_start_write(file_inode(file)->i_sb);
448 	ret = __mnt_want_write_file(file);
449 	if (ret)
450 		sb_end_write(file_inode(file)->i_sb);
451 	return ret;
452 }
453 EXPORT_SYMBOL_GPL(mnt_want_write_file);
454 
455 /**
456  * __mnt_drop_write - give up write access to a mount
457  * @mnt: the mount on which to give up write access
458  *
459  * Tells the low-level filesystem that we are done
460  * performing writes to it.  Must be matched with
461  * __mnt_want_write() call above.
462  */
463 void __mnt_drop_write(struct vfsmount *mnt)
464 {
465 	preempt_disable();
466 	mnt_dec_writers(real_mount(mnt));
467 	preempt_enable();
468 }
469 
470 /**
471  * mnt_drop_write - give up write access to a mount
472  * @mnt: the mount on which to give up write access
473  *
474  * Tells the low-level filesystem that we are done performing writes to it and
475  * also allows filesystem to be frozen again.  Must be matched with
476  * mnt_want_write() call above.
477  */
478 void mnt_drop_write(struct vfsmount *mnt)
479 {
480 	__mnt_drop_write(mnt);
481 	sb_end_write(mnt->mnt_sb);
482 }
483 EXPORT_SYMBOL_GPL(mnt_drop_write);
484 
485 void __mnt_drop_write_file(struct file *file)
486 {
487 	if (!(file->f_mode & FMODE_WRITER))
488 		__mnt_drop_write(file->f_path.mnt);
489 }
490 
491 void mnt_drop_write_file(struct file *file)
492 {
493 	__mnt_drop_write_file(file);
494 	sb_end_write(file_inode(file)->i_sb);
495 }
496 EXPORT_SYMBOL(mnt_drop_write_file);
497 
498 /**
499  * mnt_hold_writers - prevent write access to the given mount
500  * @mnt: mnt to prevent write access to
501  *
502  * Prevents write access to @mnt if there are no active writers for @mnt.
503  * This function needs to be called and return successfully before changing
504  * properties of @mnt that need to remain stable for callers with write access
505  * to @mnt.
506  *
507  * After this function has been called successfully, callers must pair it with
508  * a call to mnt_unhold_writers() in order to stop preventing write access to
509  * @mnt.
510  *
511  * Context: This function expects lock_mount_hash() to be held serializing
512  *          setting MNT_WRITE_HOLD.
513  * Return: On success 0 is returned.
514  *	   On error, -EBUSY is returned.
515  */
516 static inline int mnt_hold_writers(struct mount *mnt)
517 {
518 	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
519 	/*
520 	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
521 	 * should be visible before we do.
522 	 */
523 	smp_mb();
524 
525 	/*
526 	 * With writers on hold, if this value is zero, then there are
527 	 * definitely no active writers (although held writers may subsequently
528 	 * increment the count, they'll have to wait, and decrement it after
529 	 * seeing MNT_READONLY).
530 	 *
531 	 * It is OK to have counter incremented on one CPU and decremented on
532 	 * another: the sum will add up correctly. The danger would be when we
533 	 * sum up each counter, if we read a counter before it is incremented,
534 	 * but then read another CPU's count which it has been subsequently
535 	 * decremented from -- we would see more decrements than we should.
536 	 * MNT_WRITE_HOLD protects against this scenario, because
537 	 * mnt_want_write first increments count, then smp_mb, then spins on
538 	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
539 	 * we're counting up here.
540 	 */
541 	if (mnt_get_writers(mnt) > 0)
542 		return -EBUSY;
543 
544 	return 0;
545 }
546 
547 /**
548  * mnt_unhold_writers - stop preventing write access to the given mount
549  * @mnt: mnt to stop preventing write access to
550  *
551  * Stop preventing write access to @mnt allowing callers to gain write access
552  * to @mnt again.
553  *
554  * This function can only be called after a successful call to
555  * mnt_hold_writers().
556  *
557  * Context: This function expects lock_mount_hash() to be held.
558  */
559 static inline void mnt_unhold_writers(struct mount *mnt)
560 {
561 	/*
562 	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
563 	 * that become unheld will see MNT_READONLY.
564 	 */
565 	smp_wmb();
566 	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
567 }
568 
569 static int mnt_make_readonly(struct mount *mnt)
570 {
571 	int ret;
572 
573 	ret = mnt_hold_writers(mnt);
574 	if (!ret)
575 		mnt->mnt.mnt_flags |= MNT_READONLY;
576 	mnt_unhold_writers(mnt);
577 	return ret;
578 }
579 
580 int sb_prepare_remount_readonly(struct super_block *sb)
581 {
582 	struct mount *mnt;
583 	int err = 0;
584 
585 	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
586 	if (atomic_long_read(&sb->s_remove_count))
587 		return -EBUSY;
588 
589 	lock_mount_hash();
590 	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
591 		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
592 			err = mnt_hold_writers(mnt);
593 			if (err)
594 				break;
595 		}
596 	}
597 	if (!err && atomic_long_read(&sb->s_remove_count))
598 		err = -EBUSY;
599 
600 	if (!err)
601 		sb_start_ro_state_change(sb);
602 	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
603 		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
604 			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
605 	}
606 	unlock_mount_hash();
607 
608 	return err;
609 }
610 
611 static void free_vfsmnt(struct mount *mnt)
612 {
613 	mnt_idmap_put(mnt_idmap(&mnt->mnt));
614 	kfree_const(mnt->mnt_devname);
615 #ifdef CONFIG_SMP
616 	free_percpu(mnt->mnt_pcp);
617 #endif
618 	kmem_cache_free(mnt_cache, mnt);
619 }
620 
621 static void delayed_free_vfsmnt(struct rcu_head *head)
622 {
623 	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
624 }
625 
626 /* call under rcu_read_lock */
627 int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
628 {
629 	struct mount *mnt;
630 	if (read_seqretry(&mount_lock, seq))
631 		return 1;
632 	if (bastard == NULL)
633 		return 0;
634 	mnt = real_mount(bastard);
635 	mnt_add_count(mnt, 1);
636 	smp_mb();			// see mntput_no_expire()
637 	if (likely(!read_seqretry(&mount_lock, seq)))
638 		return 0;
639 	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
640 		mnt_add_count(mnt, -1);
641 		return 1;
642 	}
643 	lock_mount_hash();
644 	if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
645 		mnt_add_count(mnt, -1);
646 		unlock_mount_hash();
647 		return 1;
648 	}
649 	unlock_mount_hash();
650 	/* caller will mntput() */
651 	return -1;
652 }
653 
654 /* call under rcu_read_lock */
655 static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
656 {
657 	int res = __legitimize_mnt(bastard, seq);
658 	if (likely(!res))
659 		return true;
660 	if (unlikely(res < 0)) {
661 		rcu_read_unlock();
662 		mntput(bastard);
663 		rcu_read_lock();
664 	}
665 	return false;
666 }
667 
668 /*
669  * find the first mount at @dentry on vfsmount @mnt.
670  * call under rcu_read_lock()
671  */
672 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
673 {
674 	struct hlist_head *head = m_hash(mnt, dentry);
675 	struct mount *p;
676 
677 	hlist_for_each_entry_rcu(p, head, mnt_hash)
678 		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
679 			return p;
680 	return NULL;
681 }
682 
683 /*
684  * lookup_mnt - Return the first child mount mounted at path
685  *
686  * "First" means first mounted chronologically.  If you create the
687  * following mounts:
688  *
689  * mount /dev/sda1 /mnt
690  * mount /dev/sda2 /mnt
691  * mount /dev/sda3 /mnt
692  *
693  * Then lookup_mnt() on the base /mnt dentry in the root mount will
694  * return successively the root dentry and vfsmount of /dev/sda1, then
695  * /dev/sda2, then /dev/sda3, then NULL.
696  *
697  * lookup_mnt takes a reference to the found vfsmount.
698  */
699 struct vfsmount *lookup_mnt(const struct path *path)
700 {
701 	struct mount *child_mnt;
702 	struct vfsmount *m;
703 	unsigned seq;
704 
705 	rcu_read_lock();
706 	do {
707 		seq = read_seqbegin(&mount_lock);
708 		child_mnt = __lookup_mnt(path->mnt, path->dentry);
709 		m = child_mnt ? &child_mnt->mnt : NULL;
710 	} while (!legitimize_mnt(m, seq));
711 	rcu_read_unlock();
712 	return m;
713 }
714 
715 static inline void lock_ns_list(struct mnt_namespace *ns)
716 {
717 	spin_lock(&ns->ns_lock);
718 }
719 
720 static inline void unlock_ns_list(struct mnt_namespace *ns)
721 {
722 	spin_unlock(&ns->ns_lock);
723 }
724 
725 static inline bool mnt_is_cursor(struct mount *mnt)
726 {
727 	return mnt->mnt.mnt_flags & MNT_CURSOR;
728 }
729 
730 /*
731  * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
732  *                         current mount namespace.
733  *
734  * The common case is dentries are not mountpoints at all and that
735  * test is handled inline.  For the slow case when we are actually
736  * dealing with a mountpoint of some kind, walk through all of the
737  * mounts in the current mount namespace and test to see if the dentry
738  * is a mountpoint.
739  *
740  * The mount_hashtable is not usable in this context because we
741  * need to identify all mounts that may be in the current mount
742  * namespace not just a mount that happens to have some specified
743  * parent mount.
744  */
745 bool __is_local_mountpoint(struct dentry *dentry)
746 {
747 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
748 	struct mount *mnt;
749 	bool is_covered = false;
750 
751 	down_read(&namespace_sem);
752 	lock_ns_list(ns);
753 	list_for_each_entry(mnt, &ns->list, mnt_list) {
754 		if (mnt_is_cursor(mnt))
755 			continue;
756 		is_covered = (mnt->mnt_mountpoint == dentry);
757 		if (is_covered)
758 			break;
759 	}
760 	unlock_ns_list(ns);
761 	up_read(&namespace_sem);
762 
763 	return is_covered;
764 }
765 
766 static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
767 {
768 	struct hlist_head *chain = mp_hash(dentry);
769 	struct mountpoint *mp;
770 
771 	hlist_for_each_entry(mp, chain, m_hash) {
772 		if (mp->m_dentry == dentry) {
773 			mp->m_count++;
774 			return mp;
775 		}
776 	}
777 	return NULL;
778 }
779 
780 static struct mountpoint *get_mountpoint(struct dentry *dentry)
781 {
782 	struct mountpoint *mp, *new = NULL;
783 	int ret;
784 
785 	if (d_mountpoint(dentry)) {
786 		/* might be worth a WARN_ON() */
787 		if (d_unlinked(dentry))
788 			return ERR_PTR(-ENOENT);
789 mountpoint:
790 		read_seqlock_excl(&mount_lock);
791 		mp = lookup_mountpoint(dentry);
792 		read_sequnlock_excl(&mount_lock);
793 		if (mp)
794 			goto done;
795 	}
796 
797 	if (!new)
798 		new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
799 	if (!new)
800 		return ERR_PTR(-ENOMEM);
801 
802 
803 	/* Exactly one process may set d_mounted */
804 	ret = d_set_mounted(dentry);
805 
806 	/* Someone else set d_mounted? */
807 	if (ret == -EBUSY)
808 		goto mountpoint;
809 
810 	/* The dentry is not available as a mountpoint? */
811 	mp = ERR_PTR(ret);
812 	if (ret)
813 		goto done;
814 
815 	/* Add the new mountpoint to the hash table */
816 	read_seqlock_excl(&mount_lock);
817 	new->m_dentry = dget(dentry);
818 	new->m_count = 1;
819 	hlist_add_head(&new->m_hash, mp_hash(dentry));
820 	INIT_HLIST_HEAD(&new->m_list);
821 	read_sequnlock_excl(&mount_lock);
822 
823 	mp = new;
824 	new = NULL;
825 done:
826 	kfree(new);
827 	return mp;
828 }
829 
830 /*
831  * vfsmount lock must be held.  Additionally, the caller is responsible
832  * for serializing calls for a given disposal list.
833  */
834 static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
835 {
836 	if (!--mp->m_count) {
837 		struct dentry *dentry = mp->m_dentry;
838 		BUG_ON(!hlist_empty(&mp->m_list));
839 		spin_lock(&dentry->d_lock);
840 		dentry->d_flags &= ~DCACHE_MOUNTED;
841 		spin_unlock(&dentry->d_lock);
842 		dput_to_list(dentry, list);
843 		hlist_del(&mp->m_hash);
844 		kfree(mp);
845 	}
846 }
847 
848 /* called with namespace_lock and vfsmount lock */
849 static void put_mountpoint(struct mountpoint *mp)
850 {
851 	__put_mountpoint(mp, &ex_mountpoints);
852 }
853 
854 static inline int check_mnt(struct mount *mnt)
855 {
856 	return mnt->mnt_ns == current->nsproxy->mnt_ns;
857 }
858 
859 /*
860  * vfsmount lock must be held for write
861  */
862 static void touch_mnt_namespace(struct mnt_namespace *ns)
863 {
864 	if (ns) {
865 		ns->event = ++event;
866 		wake_up_interruptible(&ns->poll);
867 	}
868 }
869 
870 /*
871  * vfsmount lock must be held for write
872  */
873 static void __touch_mnt_namespace(struct mnt_namespace *ns)
874 {
875 	if (ns && ns->event != event) {
876 		ns->event = event;
877 		wake_up_interruptible(&ns->poll);
878 	}
879 }
880 
881 /*
882  * vfsmount lock must be held for write
883  */
884 static struct mountpoint *unhash_mnt(struct mount *mnt)
885 {
886 	struct mountpoint *mp;
887 	mnt->mnt_parent = mnt;
888 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
889 	list_del_init(&mnt->mnt_child);
890 	hlist_del_init_rcu(&mnt->mnt_hash);
891 	hlist_del_init(&mnt->mnt_mp_list);
892 	mp = mnt->mnt_mp;
893 	mnt->mnt_mp = NULL;
894 	return mp;
895 }
896 
897 /*
898  * vfsmount lock must be held for write
899  */
900 static void umount_mnt(struct mount *mnt)
901 {
902 	put_mountpoint(unhash_mnt(mnt));
903 }
904 
905 /*
906  * vfsmount lock must be held for write
907  */
908 void mnt_set_mountpoint(struct mount *mnt,
909 			struct mountpoint *mp,
910 			struct mount *child_mnt)
911 {
912 	mp->m_count++;
913 	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
914 	child_mnt->mnt_mountpoint = mp->m_dentry;
915 	child_mnt->mnt_parent = mnt;
916 	child_mnt->mnt_mp = mp;
917 	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
918 }
919 
920 static void __attach_mnt(struct mount *mnt, struct mount *parent)
921 {
922 	hlist_add_head_rcu(&mnt->mnt_hash,
923 			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
924 	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
925 }
926 
927 /*
928  * vfsmount lock must be held for write
929  */
930 static void attach_mnt(struct mount *mnt,
931 			struct mount *parent,
932 			struct mountpoint *mp)
933 {
934 	mnt_set_mountpoint(parent, mp, mnt);
935 	__attach_mnt(mnt, parent);
936 }
937 
938 void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
939 {
940 	struct mountpoint *old_mp = mnt->mnt_mp;
941 	struct mount *old_parent = mnt->mnt_parent;
942 
943 	list_del_init(&mnt->mnt_child);
944 	hlist_del_init(&mnt->mnt_mp_list);
945 	hlist_del_init_rcu(&mnt->mnt_hash);
946 
947 	attach_mnt(mnt, parent, mp);
948 
949 	put_mountpoint(old_mp);
950 	mnt_add_count(old_parent, -1);
951 }
952 
953 /*
954  * vfsmount lock must be held for write
955  */
956 static void commit_tree(struct mount *mnt)
957 {
958 	struct mount *parent = mnt->mnt_parent;
959 	struct mount *m;
960 	LIST_HEAD(head);
961 	struct mnt_namespace *n = parent->mnt_ns;
962 
963 	BUG_ON(parent == mnt);
964 
965 	list_add_tail(&head, &mnt->mnt_list);
966 	list_for_each_entry(m, &head, mnt_list)
967 		m->mnt_ns = n;
968 
969 	list_splice(&head, n->list.prev);
970 
971 	n->mounts += n->pending_mounts;
972 	n->pending_mounts = 0;
973 
974 	__attach_mnt(mnt, parent);
975 	touch_mnt_namespace(n);
976 }
977 
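/*
 * Depth-first iterator over the mount tree rooted at @root: return the mount
 * visited after @p in pre-order, or NULL once the entire tree has been walked.
 */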
978 static struct mount *next_mnt(struct mount *p, struct mount *root)
979 {
980 	struct list_head *next = p->mnt_mounts.next;
981 	if (next == &p->mnt_mounts) {
982 		while (1) {
983 			if (p == root)
984 				return NULL;
985 			next = p->mnt_child.next;
986 			if (next != &p->mnt_parent->mnt_mounts)
987 				break;
988 			p = p->mnt_parent;
989 		}
990 	}
991 	return list_entry(next, struct mount, mnt_child);
992 }
993 
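/*
 * Return the bottom-most mount in the subtree rooted at @p, i.e. the last
 * node a next_mnt() walk would visit there, so callers can skip the subtree.
 */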
994 static struct mount *skip_mnt_tree(struct mount *p)
995 {
996 	struct list_head *prev = p->mnt_mounts.prev;
997 	while (prev != &p->mnt_mounts) {
998 		p = list_entry(prev, struct mount, mnt_child);
999 		prev = p->mnt_mounts.prev;
1000 	}
1001 	return p;
1002 }
1003 
1004 /**
1005  * vfs_create_mount - Create a mount for a configured superblock
1006  * @fc: The configuration context with the superblock attached
1007  *
1008  * Create a mount to an already configured superblock.  If necessary, the
1009  * caller should invoke vfs_get_tree() before calling this.
1010  *
1011  * Note that this does not attach the mount to anything.
1012  */
1013 struct vfsmount *vfs_create_mount(struct fs_context *fc)
1014 {
1015 	struct mount *mnt;
1016 
1017 	if (!fc->root)
1018 		return ERR_PTR(-EINVAL);
1019 
1020 	mnt = alloc_vfsmnt(fc->source ?: "none");
1021 	if (!mnt)
1022 		return ERR_PTR(-ENOMEM);
1023 
1024 	if (fc->sb_flags & SB_KERNMOUNT)
1025 		mnt->mnt.mnt_flags = MNT_INTERNAL;
1026 
1027 	atomic_inc(&fc->root->d_sb->s_active);
1028 	mnt->mnt.mnt_sb		= fc->root->d_sb;
1029 	mnt->mnt.mnt_root	= dget(fc->root);
1030 	mnt->mnt_mountpoint	= mnt->mnt.mnt_root;
1031 	mnt->mnt_parent		= mnt;
1032 
1033 	lock_mount_hash();
1034 	list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
1035 	unlock_mount_hash();
1036 	return &mnt->mnt;
1037 }
1038 EXPORT_SYMBOL(vfs_create_mount);
1039 
1040 struct vfsmount *fc_mount(struct fs_context *fc)
1041 {
1042 	int err = vfs_get_tree(fc);
1043 	if (!err) {
1044 		up_write(&fc->root->d_sb->s_umount);
1045 		return vfs_create_mount(fc);
1046 	}
1047 	return ERR_PTR(err);
1048 }
1049 EXPORT_SYMBOL(fc_mount);
1050 
1051 struct vfsmount *vfs_kern_mount(struct file_system_type *type,
1052 				int flags, const char *name,
1053 				void *data)
1054 {
1055 	struct fs_context *fc;
1056 	struct vfsmount *mnt;
1057 	int ret = 0;
1058 
1059 	if (!type)
1060 		return ERR_PTR(-EINVAL);
1061 
1062 	fc = fs_context_for_mount(type, flags);
1063 	if (IS_ERR(fc))
1064 		return ERR_CAST(fc);
1065 
1066 	if (name)
1067 		ret = vfs_parse_fs_string(fc, "source",
1068 					  name, strlen(name));
1069 	if (!ret)
1070 		ret = parse_monolithic_mount_data(fc, data);
1071 	if (!ret)
1072 		mnt = fc_mount(fc);
1073 	else
1074 		mnt = ERR_PTR(ret);
1075 
1076 	put_fs_context(fc);
1077 	return mnt;
1078 }
1079 EXPORT_SYMBOL_GPL(vfs_kern_mount);
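
/*
 * Illustrative in-kernel use (a sketch; "example_fs_type" is a made-up
 * file_system_type, not something defined in this file):
 *
 *	struct vfsmount *m;
 *
 *	m = vfs_kern_mount(&example_fs_type, SB_KERNMOUNT, "example", NULL);
 *	if (IS_ERR(m))
 *		return PTR_ERR(m);
 *	...
 *	mntput(m);
 */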
1080 
1081 struct vfsmount *
1082 vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
1083 	     const char *name, void *data)
1084 {
1085 	/* Until it is worked out how to pass the user namespace
1086 	 * through from the parent mount to the submount, don't support
1087 	 * unprivileged mounts with submounts.
1088 	 */
1089 	if (mountpoint->d_sb->s_user_ns != &init_user_ns)
1090 		return ERR_PTR(-EPERM);
1091 
1092 	return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
1093 }
1094 EXPORT_SYMBOL_GPL(vfs_submount);
1095 
1096 static struct mount *clone_mnt(struct mount *old, struct dentry *root,
1097 					int flag)
1098 {
1099 	struct super_block *sb = old->mnt.mnt_sb;
1100 	struct mount *mnt;
1101 	int err;
1102 
1103 	mnt = alloc_vfsmnt(old->mnt_devname);
1104 	if (!mnt)
1105 		return ERR_PTR(-ENOMEM);
1106 
1107 	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
1108 		mnt->mnt_group_id = 0; /* not a peer of original */
1109 	else
1110 		mnt->mnt_group_id = old->mnt_group_id;
1111 
1112 	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
1113 		err = mnt_alloc_group_id(mnt);
1114 		if (err)
1115 			goto out_free;
1116 	}
1117 
1118 	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
1119 	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
1120 
1121 	atomic_inc(&sb->s_active);
1122 	mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
1123 
1124 	mnt->mnt.mnt_sb = sb;
1125 	mnt->mnt.mnt_root = dget(root);
1126 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1127 	mnt->mnt_parent = mnt;
1128 	lock_mount_hash();
1129 	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
1130 	unlock_mount_hash();
1131 
1132 	if ((flag & CL_SLAVE) ||
1133 	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
1134 		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
1135 		mnt->mnt_master = old;
1136 		CLEAR_MNT_SHARED(mnt);
1137 	} else if (!(flag & CL_PRIVATE)) {
1138 		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
1139 			list_add(&mnt->mnt_share, &old->mnt_share);
1140 		if (IS_MNT_SLAVE(old))
1141 			list_add(&mnt->mnt_slave, &old->mnt_slave);
1142 		mnt->mnt_master = old->mnt_master;
1143 	} else {
1144 		CLEAR_MNT_SHARED(mnt);
1145 	}
1146 	if (flag & CL_MAKE_SHARED)
1147 		set_mnt_shared(mnt);
1148 
1149 	/* stick the duplicate mount on the same expiry list
1150 	 * as the original if that was on one */
1151 	if (flag & CL_EXPIRE) {
1152 		if (!list_empty(&old->mnt_expire))
1153 			list_add(&mnt->mnt_expire, &old->mnt_expire);
1154 	}
1155 
1156 	return mnt;
1157 
1158  out_free:
1159 	mnt_free_id(mnt);
1160 	free_vfsmnt(mnt);
1161 	return ERR_PTR(err);
1162 }
1163 
1164 static void cleanup_mnt(struct mount *mnt)
1165 {
1166 	struct hlist_node *p;
1167 	struct mount *m;
1168 	/*
1169 	 * The warning here probably indicates that somebody messed
1170 	 * up a mnt_want/drop_write() pair.  If this happens, the
1171 	 * filesystem was probably unable to make r/w->r/o transitions.
1172 	 * The locking used to deal with mnt_count decrement provides barriers,
1173 	 * so mnt_get_writers() below is safe.
1174 	 */
1175 	WARN_ON(mnt_get_writers(mnt));
1176 	if (unlikely(mnt->mnt_pins.first))
1177 		mnt_pin_kill(mnt);
1178 	hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
1179 		hlist_del(&m->mnt_umount);
1180 		mntput(&m->mnt);
1181 	}
1182 	fsnotify_vfsmount_delete(&mnt->mnt);
1183 	dput(mnt->mnt.mnt_root);
1184 	deactivate_super(mnt->mnt.mnt_sb);
1185 	mnt_free_id(mnt);
1186 	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
1187 }
1188 
1189 static void __cleanup_mnt(struct rcu_head *head)
1190 {
1191 	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
1192 }
1193 
1194 static LLIST_HEAD(delayed_mntput_list);
1195 static void delayed_mntput(struct work_struct *unused)
1196 {
1197 	struct llist_node *node = llist_del_all(&delayed_mntput_list);
1198 	struct mount *m, *t;
1199 
1200 	llist_for_each_entry_safe(m, t, node, mnt_llist)
1201 		cleanup_mnt(m);
1202 }
1203 static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
1204 
1205 static void mntput_no_expire(struct mount *mnt)
1206 {
1207 	LIST_HEAD(list);
1208 	int count;
1209 
1210 	rcu_read_lock();
1211 	if (likely(READ_ONCE(mnt->mnt_ns))) {
1212 		/*
1213 		 * Since we don't do lock_mount_hash() here,
1214 		 * ->mnt_ns can change under us.  However, if it's
1215 		 * non-NULL, then there's a reference that won't
1216 		 * be dropped until after an RCU delay done after
1217 		 * turning ->mnt_ns NULL.  So if we observe it
1218 		 * non-NULL under rcu_read_lock(), the reference
1219 		 * we are dropping is not the final one.
1220 		 */
1221 		mnt_add_count(mnt, -1);
1222 		rcu_read_unlock();
1223 		return;
1224 	}
1225 	lock_mount_hash();
1226 	/*
1227 	 * make sure that if __legitimize_mnt() has not seen us grab
1228 	 * mount_lock, we'll see their refcount increment here.
1229 	 */
1230 	smp_mb();
1231 	mnt_add_count(mnt, -1);
1232 	count = mnt_get_count(mnt);
1233 	if (count != 0) {
1234 		WARN_ON(count < 0);
1235 		rcu_read_unlock();
1236 		unlock_mount_hash();
1237 		return;
1238 	}
1239 	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
1240 		rcu_read_unlock();
1241 		unlock_mount_hash();
1242 		return;
1243 	}
1244 	mnt->mnt.mnt_flags |= MNT_DOOMED;
1245 	rcu_read_unlock();
1246 
1247 	list_del(&mnt->mnt_instance);
1248 
1249 	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
1250 		struct mount *p, *tmp;
1251 		list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
1252 			__put_mountpoint(unhash_mnt(p), &list);
1253 			hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
1254 		}
1255 	}
1256 	unlock_mount_hash();
1257 	shrink_dentry_list(&list);
1258 
1259 	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
1260 		struct task_struct *task = current;
1261 		if (likely(!(task->flags & PF_KTHREAD))) {
1262 			init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
1263 			if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
1264 				return;
1265 		}
1266 		if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
1267 			schedule_delayed_work(&delayed_mntput_work, 1);
1268 		return;
1269 	}
1270 	cleanup_mnt(mnt);
1271 }
1272 
1273 void mntput(struct vfsmount *mnt)
1274 {
1275 	if (mnt) {
1276 		struct mount *m = real_mount(mnt);
1277 		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
1278 		if (unlikely(m->mnt_expiry_mark))
1279 			m->mnt_expiry_mark = 0;
1280 		mntput_no_expire(m);
1281 	}
1282 }
1283 EXPORT_SYMBOL(mntput);
1284 
1285 struct vfsmount *mntget(struct vfsmount *mnt)
1286 {
1287 	if (mnt)
1288 		mnt_add_count(real_mount(mnt), 1);
1289 	return mnt;
1290 }
1291 EXPORT_SYMBOL(mntget);
1292 
1293 /*
1294  * Make a mount point inaccessible to new lookups.
1295  * Because there may still be current users, the caller MUST WAIT
1296  * for an RCU grace period before destroying the mount point.
1297  */
1298 void mnt_make_shortterm(struct vfsmount *mnt)
1299 {
1300 	if (mnt)
1301 		real_mount(mnt)->mnt_ns = NULL;
1302 }
1303 
1304 /**
1305  * path_is_mountpoint() - Check if path is a mount in the current namespace.
1306  * @path: path to check
1307  *
1308  *  d_mountpoint() can only be used reliably to establish if a dentry is
1309  *  not mounted in any namespace and that common case is handled inline.
1310  *  d_mountpoint() isn't aware of the possibility there may be multiple
1311  *  mounts using a given dentry in a different namespace. This function
1312  *  checks if the passed in path is a mountpoint rather than the dentry
1313  *  alone.
1314  */
1315 bool path_is_mountpoint(const struct path *path)
1316 {
1317 	unsigned seq;
1318 	bool res;
1319 
1320 	if (!d_mountpoint(path->dentry))
1321 		return false;
1322 
1323 	rcu_read_lock();
1324 	do {
1325 		seq = read_seqbegin(&mount_lock);
1326 		res = __path_is_mountpoint(path);
1327 	} while (read_seqretry(&mount_lock, seq));
1328 	rcu_read_unlock();
1329 
1330 	return res;
1331 }
1332 EXPORT_SYMBOL(path_is_mountpoint);
1333 
1334 struct vfsmount *mnt_clone_internal(const struct path *path)
1335 {
1336 	struct mount *p;
1337 	p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
1338 	if (IS_ERR(p))
1339 		return ERR_CAST(p);
1340 	p->mnt.mnt_flags |= MNT_INTERNAL;
1341 	return &p->mnt;
1342 }
1343 
1344 #ifdef CONFIG_PROC_FS
1345 static struct mount *mnt_list_next(struct mnt_namespace *ns,
1346 				   struct list_head *p)
1347 {
1348 	struct mount *mnt, *ret = NULL;
1349 
1350 	lock_ns_list(ns);
1351 	list_for_each_continue(p, &ns->list) {
1352 		mnt = list_entry(p, typeof(*mnt), mnt_list);
1353 		if (!mnt_is_cursor(mnt)) {
1354 			ret = mnt;
1355 			break;
1356 		}
1357 	}
1358 	unlock_ns_list(ns);
1359 
1360 	return ret;
1361 }
1362 
1363 /* iterator; we want it to have access to namespace_sem, thus here... */
1364 static void *m_start(struct seq_file *m, loff_t *pos)
1365 {
1366 	struct proc_mounts *p = m->private;
1367 	struct list_head *prev;
1368 
1369 	down_read(&namespace_sem);
1370 	if (!*pos) {
1371 		prev = &p->ns->list;
1372 	} else {
1373 		prev = &p->cursor.mnt_list;
1374 
1375 		/* Read after we'd reached the end? */
1376 		if (list_empty(prev))
1377 			return NULL;
1378 	}
1379 
1380 	return mnt_list_next(p->ns, prev);
1381 }
1382 
1383 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1384 {
1385 	struct proc_mounts *p = m->private;
1386 	struct mount *mnt = v;
1387 
1388 	++*pos;
1389 	return mnt_list_next(p->ns, &mnt->mnt_list);
1390 }
1391 
1392 static void m_stop(struct seq_file *m, void *v)
1393 {
1394 	struct proc_mounts *p = m->private;
1395 	struct mount *mnt = v;
1396 
1397 	lock_ns_list(p->ns);
1398 	if (mnt)
1399 		list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
1400 	else
1401 		list_del_init(&p->cursor.mnt_list);
1402 	unlock_ns_list(p->ns);
1403 	up_read(&namespace_sem);
1404 }
1405 
1406 static int m_show(struct seq_file *m, void *v)
1407 {
1408 	struct proc_mounts *p = m->private;
1409 	struct mount *r = v;
1410 	return p->show(m, &r->mnt);
1411 }
1412 
1413 const struct seq_operations mounts_op = {
1414 	.start	= m_start,
1415 	.next	= m_next,
1416 	.stop	= m_stop,
1417 	.show	= m_show,
1418 };
1419 
1420 void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
1421 {
1422 	down_read(&namespace_sem);
1423 	lock_ns_list(ns);
1424 	list_del(&cursor->mnt_list);
1425 	unlock_ns_list(ns);
1426 	up_read(&namespace_sem);
1427 }
1428 #endif  /* CONFIG_PROC_FS */
1429 
1430 /**
1431  * may_umount_tree - check if a mount tree is busy
1432  * @m: root of mount tree
1433  *
1434  * This is called to check if a tree of mounts has any
1435  * open files, pwds, chroots or sub mounts that are
1436  * busy.
1437  */
1438 int may_umount_tree(struct vfsmount *m)
1439 {
1440 	struct mount *mnt = real_mount(m);
1441 	int actual_refs = 0;
1442 	int minimum_refs = 0;
1443 	struct mount *p;
1444 	BUG_ON(!m);
1445 
1446 	/* write lock needed for mnt_get_count */
1447 	lock_mount_hash();
1448 	for (p = mnt; p; p = next_mnt(p, mnt)) {
1449 		actual_refs += mnt_get_count(p);
1450 		minimum_refs += 2;
1451 	}
1452 	unlock_mount_hash();
1453 
1454 	if (actual_refs > minimum_refs)
1455 		return 0;
1456 
1457 	return 1;
1458 }
1459 
1460 EXPORT_SYMBOL(may_umount_tree);
1461 
1462 /**
1463  * may_umount - check if a mount point is busy
1464  * @mnt: root of mount
1465  *
1466  * This is called to check if a mount point has any
1467  * open files, pwds, chroots or sub mounts. If the
1468  * mount has sub mounts this will return busy
1469  * regardless of whether the sub mounts are busy.
1470  *
1471  * Doesn't take quota and stuff into account. IOW, in some cases it will
1472  * give false negatives. The main reason why it's here is that we need
1473  * a non-destructive way to look for easily umountable filesystems.
1474  */
1475 int may_umount(struct vfsmount *mnt)
1476 {
1477 	int ret = 1;
1478 	down_read(&namespace_sem);
1479 	lock_mount_hash();
1480 	if (propagate_mount_busy(real_mount(mnt), 2))
1481 		ret = 0;
1482 	unlock_mount_hash();
1483 	up_read(&namespace_sem);
1484 	return ret;
1485 }
1486 
1487 EXPORT_SYMBOL(may_umount);
1488 
1489 static void namespace_unlock(void)
1490 {
1491 	struct hlist_head head;
1492 	struct hlist_node *p;
1493 	struct mount *m;
1494 	LIST_HEAD(list);
1495 
1496 	hlist_move_list(&unmounted, &head);
1497 	list_splice_init(&ex_mountpoints, &list);
1498 
1499 	up_write(&namespace_sem);
1500 
1501 	shrink_dentry_list(&list);
1502 
1503 	if (likely(hlist_empty(&head)))
1504 		return;
1505 
1506 	synchronize_rcu_expedited();
1507 
1508 	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
1509 		hlist_del(&m->mnt_umount);
1510 		mntput(&m->mnt);
1511 	}
1512 }
1513 
1514 static inline void namespace_lock(void)
1515 {
1516 	down_write(&namespace_sem);
1517 }
1518 
1519 enum umount_tree_flags {
1520 	UMOUNT_SYNC = 1,
1521 	UMOUNT_PROPAGATE = 2,
1522 	UMOUNT_CONNECTED = 4,
1523 };
1524 
1525 static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
1526 {
1527 	/* Leaving mounts connected is only valid for lazy umounts */
1528 	if (how & UMOUNT_SYNC)
1529 		return true;
1530 
1531 	/* A mount without a parent has nothing to be connected to */
1532 	if (!mnt_has_parent(mnt))
1533 		return true;
1534 
1535 	/* Because the reference counting rules change when mounts are
1536 	 * unmounted and connected, umounted mounts may not be
1537 	 * connected to mounted mounts.
1538 	 */
1539 	if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
1540 		return true;
1541 
1542 	/* Has it been requested that the mount remain connected? */
1543 	if (how & UMOUNT_CONNECTED)
1544 		return false;
1545 
1546 	/* Is the mount locked such that it needs to remain connected? */
1547 	if (IS_MNT_LOCKED(mnt))
1548 		return false;
1549 
1550 	/* By default disconnect the mount */
1551 	return true;
1552 }
1553 
1554 /*
1555  * mount_lock must be held
1556  * namespace_sem must be held for write
1557  */
1558 static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
1559 {
1560 	LIST_HEAD(tmp_list);
1561 	struct mount *p;
1562 
1563 	if (how & UMOUNT_PROPAGATE)
1564 		propagate_mount_unlock(mnt);
1565 
1566 	/* Gather the mounts to umount */
1567 	for (p = mnt; p; p = next_mnt(p, mnt)) {
1568 		p->mnt.mnt_flags |= MNT_UMOUNT;
1569 		list_move(&p->mnt_list, &tmp_list);
1570 	}
1571 
1572 	/* Hide the mounts from mnt_mounts */
1573 	list_for_each_entry(p, &tmp_list, mnt_list) {
1574 		list_del_init(&p->mnt_child);
1575 	}
1576 
1577 	/* Add propagated mounts to the tmp_list */
1578 	if (how & UMOUNT_PROPAGATE)
1579 		propagate_umount(&tmp_list);
1580 
1581 	while (!list_empty(&tmp_list)) {
1582 		struct mnt_namespace *ns;
1583 		bool disconnect;
1584 		p = list_first_entry(&tmp_list, struct mount, mnt_list);
1585 		list_del_init(&p->mnt_expire);
1586 		list_del_init(&p->mnt_list);
1587 		ns = p->mnt_ns;
1588 		if (ns) {
1589 			ns->mounts--;
1590 			__touch_mnt_namespace(ns);
1591 		}
1592 		p->mnt_ns = NULL;
1593 		if (how & UMOUNT_SYNC)
1594 			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1595 
1596 		disconnect = disconnect_mount(p, how);
1597 		if (mnt_has_parent(p)) {
1598 			mnt_add_count(p->mnt_parent, -1);
1599 			if (!disconnect) {
1600 				/* Don't forget about p */
1601 				list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
1602 			} else {
1603 				umount_mnt(p);
1604 			}
1605 		}
1606 		change_mnt_propagation(p, MS_PRIVATE);
1607 		if (disconnect)
1608 			hlist_add_head(&p->mnt_umount, &unmounted);
1609 	}
1610 }
1611 
1612 static void shrink_submounts(struct mount *mnt);
1613 
1614 static int do_umount_root(struct super_block *sb)
1615 {
1616 	int ret = 0;
1617 
1618 	down_write(&sb->s_umount);
1619 	if (!sb_rdonly(sb)) {
1620 		struct fs_context *fc;
1621 
1622 		fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
1623 						SB_RDONLY);
1624 		if (IS_ERR(fc)) {
1625 			ret = PTR_ERR(fc);
1626 		} else {
1627 			ret = parse_monolithic_mount_data(fc, NULL);
1628 			if (!ret)
1629 				ret = reconfigure_super(fc);
1630 			put_fs_context(fc);
1631 		}
1632 	}
1633 	up_write(&sb->s_umount);
1634 	return ret;
1635 }
1636 
1637 static int do_umount(struct mount *mnt, int flags)
1638 {
1639 	struct super_block *sb = mnt->mnt.mnt_sb;
1640 	int retval;
1641 
1642 	retval = security_sb_umount(&mnt->mnt, flags);
1643 	if (retval)
1644 		return retval;
1645 
1646 	/*
1647 	 * Allow userspace to request a mountpoint be expired rather than
1648 	 * unmounting unconditionally. Unmount only happens if:
1649 	 *  (1) the mark is already set (the mark is cleared by mntput())
1650 	 *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
1651 	 */
1652 	if (flags & MNT_EXPIRE) {
1653 		if (&mnt->mnt == current->fs->root.mnt ||
1654 		    flags & (MNT_FORCE | MNT_DETACH))
1655 			return -EINVAL;
1656 
1657 		/*
1658 		 * probably don't strictly need the lock here if we examined
1659 		 * all race cases, but it's a slowpath.
1660 		 */
1661 		lock_mount_hash();
1662 		if (mnt_get_count(mnt) != 2) {
1663 			unlock_mount_hash();
1664 			return -EBUSY;
1665 		}
1666 		unlock_mount_hash();
1667 
1668 		if (!xchg(&mnt->mnt_expiry_mark, 1))
1669 			return -EAGAIN;
1670 	}
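
	/*
	 * Illustrative userspace usage of MNT_EXPIRE (a sketch): the first
	 * umount2("/mnt", MNT_EXPIRE) sets the expiry mark and fails with
	 * -EAGAIN; a second call, with no use of the mount in between (any
	 * mntput() clears the mark), then succeeds.
	 */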
1671 
1672 	/*
1673 	 * If we may have to abort operations to get out of this
1674 	 * mount, and they will themselves hold resources we must
1675 	 * allow the fs to do things. In the Unix tradition of
1676 	 * 'Gee that's tricky, let's do it in userspace' the umount_begin
1677 	 * might fail to complete on the first run through as other tasks
1678 	 * must return, and the like. That's for the mount program to worry
1679 	 * about for the moment.
1680 	 */
1681 
1682 	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1683 		sb->s_op->umount_begin(sb);
1684 	}
1685 
1686 	/*
1687 	 * No sense in grabbing the lock for this test, but the test itself looks
1688 	 * somewhat bogus. Suggestions for better replacement?
1689 	 * Ho-hum... In principle, we might treat that as umount + switch
1690 	 * to rootfs. GC would eventually take care of the old vfsmount.
1691 	 * Actually it makes sense, especially if rootfs would contain a
1692 	 * /reboot - static binary that would close all descriptors and
1693 	 * call reboot(9). Then init(8) could umount root and exec /reboot.
1694 	 */
1695 	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
1696 		/*
1697 		 * Special case for "unmounting" root ...
1698 		 * we just try to remount it readonly.
1699 		 */
1700 		if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
1701 			return -EPERM;
1702 		return do_umount_root(sb);
1703 	}
1704 
1705 	namespace_lock();
1706 	lock_mount_hash();
1707 
1708 	/* Recheck MNT_LOCKED with the locks held */
1709 	retval = -EINVAL;
1710 	if (mnt->mnt.mnt_flags & MNT_LOCKED)
1711 		goto out;
1712 
1713 	event++;
1714 	if (flags & MNT_DETACH) {
1715 		if (!list_empty(&mnt->mnt_list))
1716 			umount_tree(mnt, UMOUNT_PROPAGATE);
1717 		retval = 0;
1718 	} else {
1719 		shrink_submounts(mnt);
1720 		retval = -EBUSY;
1721 		if (!propagate_mount_busy(mnt, 2)) {
1722 			if (!list_empty(&mnt->mnt_list))
1723 				umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
1724 			retval = 0;
1725 		}
1726 	}
1727 out:
1728 	unlock_mount_hash();
1729 	namespace_unlock();
1730 	return retval;
1731 }
1732 
1733 /*
1734  * __detach_mounts - lazily unmount all mounts on the specified dentry
1735  *
1736  * During unlink, rmdir, and d_drop it is possible to lose the path
1737  * to an existing mountpoint, and wind up leaking the mount.
1738  * detach_mounts allows lazily unmounting those mounts instead of
1739  * leaking them.
1740  *
1741  * The caller may hold dentry->d_inode->i_mutex.
1742  */
1743 void __detach_mounts(struct dentry *dentry)
1744 {
1745 	struct mountpoint *mp;
1746 	struct mount *mnt;
1747 
1748 	namespace_lock();
1749 	lock_mount_hash();
1750 	mp = lookup_mountpoint(dentry);
1751 	if (!mp)
1752 		goto out_unlock;
1753 
1754 	event++;
1755 	while (!hlist_empty(&mp->m_list)) {
1756 		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
1757 		if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
1758 			umount_mnt(mnt);
1759 			hlist_add_head(&mnt->mnt_umount, &unmounted);
1760 		}
1761 		else umount_tree(mnt, UMOUNT_CONNECTED);
1762 	}
1763 	put_mountpoint(mp);
1764 out_unlock:
1765 	unlock_mount_hash();
1766 	namespace_unlock();
1767 }
1768 
1769 /*
1770  * Is the caller allowed to modify his namespace?
1771  */
1772 bool may_mount(void)
1773 {
1774 	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
1775 }
1776 
1777 static void warn_mandlock(void)
1778 {
1779 	pr_warn_once("=======================================================\n"
1780 		     "WARNING: The mand mount option has been deprecated and\n"
1781 		     "WARNING: The mand mount option has been deprecated\n"
1782 		     "         option from the mount to silence this warning.\n"
1783 		     "=======================================================\n");
1784 }
1785 
1786 static int can_umount(const struct path *path, int flags)
1787 {
1788 	struct mount *mnt = real_mount(path->mnt);
1789 
1790 	if (!may_mount())
1791 		return -EPERM;
1792 	if (path->dentry != path->mnt->mnt_root)
1793 		return -EINVAL;
1794 	if (!check_mnt(mnt))
1795 		return -EINVAL;
1796 	if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
1797 		return -EINVAL;
1798 	if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
1799 		return -EPERM;
1800 	return 0;
1801 }
1802 
1803 // caller is responsible for flags being sane
1804 int path_umount(struct path *path, int flags)
1805 {
1806 	struct mount *mnt = real_mount(path->mnt);
1807 	int ret;
1808 
1809 	ret = can_umount(path, flags);
1810 	if (!ret)
1811 		ret = do_umount(mnt, flags);
1812 
1813 	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
1814 	dput(path->dentry);
1815 	mntput_no_expire(mnt);
1816 	return ret;
1817 }
1818 
1819 static int ksys_umount(char __user *name, int flags)
1820 {
1821 	int lookup_flags = LOOKUP_MOUNTPOINT;
1822 	struct path path;
1823 	int ret;
1824 
1825 	// basic validity checks done first
1826 	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1827 		return -EINVAL;
1828 
1829 	if (!(flags & UMOUNT_NOFOLLOW))
1830 		lookup_flags |= LOOKUP_FOLLOW;
1831 	ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
1832 	if (ret)
1833 		return ret;
1834 	return path_umount(&path, flags);
1835 }
1836 
1837 SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1838 {
1839 	return ksys_umount(name, flags);
1840 }
1841 
1842 #ifdef __ARCH_WANT_SYS_OLDUMOUNT
1843 
1844 /*
1845  *	The 2.0 compatible umount. No flags.
1846  */
1847 SYSCALL_DEFINE1(oldumount, char __user *, name)
1848 {
1849 	return ksys_umount(name, 0);
1850 }
1851 
1852 #endif
1853 
1854 static bool is_mnt_ns_file(struct dentry *dentry)
1855 {
1856 	/* Is this a proxy for a mount namespace? */
1857 	return dentry->d_op == &ns_dentry_operations &&
1858 	       dentry->d_fsdata == &mntns_operations;
1859 }
1860 
1861 static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
1862 {
1863 	return container_of(ns, struct mnt_namespace, ns);
1864 }
1865 
1866 struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
1867 {
1868 	return &mnt->ns;
1869 }
1870 
1871 static bool mnt_ns_loop(struct dentry *dentry)
1872 {
1873 	/* Could bind mounting the mount namespace inode cause a
1874 	 * mount namespace loop?
1875 	 */
1876 	struct mnt_namespace *mnt_ns;
1877 	if (!is_mnt_ns_file(dentry))
1878 		return false;
1879 
1880 	mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
1881 	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1882 }
1883 
1884 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1885 					int flag)
1886 {
1887 	struct mount *res, *p, *q, *r, *parent;
1888 
1889 	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
1890 		return ERR_PTR(-EINVAL);
1891 
1892 	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
1893 		return ERR_PTR(-EINVAL);
1894 
1895 	res = q = clone_mnt(mnt, dentry, flag);
1896 	if (IS_ERR(q))
1897 		return q;
1898 
1899 	q->mnt_mountpoint = mnt->mnt_mountpoint;
1900 
1901 	p = mnt;
1902 	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
1903 		struct mount *s;
1904 		if (!is_subdir(r->mnt_mountpoint, dentry))
1905 			continue;
1906 
1907 		for (s = r; s; s = next_mnt(s, r)) {
1908 			if (!(flag & CL_COPY_UNBINDABLE) &&
1909 			    IS_MNT_UNBINDABLE(s)) {
1910 				if (s->mnt.mnt_flags & MNT_LOCKED) {
1911 					/* Both unbindable and locked. */
1912 					q = ERR_PTR(-EPERM);
1913 					goto out;
1914 				} else {
1915 					s = skip_mnt_tree(s);
1916 					continue;
1917 				}
1918 			}
1919 			if (!(flag & CL_COPY_MNT_NS_FILE) &&
1920 			    is_mnt_ns_file(s->mnt.mnt_root)) {
1921 				s = skip_mnt_tree(s);
1922 				continue;
1923 			}
1924 			while (p != s->mnt_parent) {
1925 				p = p->mnt_parent;
1926 				q = q->mnt_parent;
1927 			}
1928 			p = s;
1929 			parent = q;
1930 			q = clone_mnt(p, p->mnt.mnt_root, flag);
1931 			if (IS_ERR(q))
1932 				goto out;
1933 			lock_mount_hash();
1934 			list_add_tail(&q->mnt_list, &res->mnt_list);
1935 			attach_mnt(q, parent, p->mnt_mp);
1936 			unlock_mount_hash();
1937 		}
1938 	}
1939 	return res;
1940 out:
1941 	if (res) {
1942 		lock_mount_hash();
1943 		umount_tree(res, UMOUNT_SYNC);
1944 		unlock_mount_hash();
1945 	}
1946 	return q;
1947 }
1948 
1949 /* Caller should check returned pointer for errors */
1950 
1951 struct vfsmount *collect_mounts(const struct path *path)
1952 {
1953 	struct mount *tree;
1954 	namespace_lock();
1955 	if (!check_mnt(real_mount(path->mnt)))
1956 		tree = ERR_PTR(-EINVAL);
1957 	else
1958 		tree = copy_tree(real_mount(path->mnt), path->dentry,
1959 				 CL_COPY_ALL | CL_PRIVATE);
1960 	namespace_unlock();
1961 	if (IS_ERR(tree))
1962 		return ERR_CAST(tree);
1963 	return &tree->mnt;
1964 }
1965 
1966 static void free_mnt_ns(struct mnt_namespace *);
1967 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
1968 
1969 void dissolve_on_fput(struct vfsmount *mnt)
1970 {
1971 	struct mnt_namespace *ns;
1972 	namespace_lock();
1973 	lock_mount_hash();
1974 	ns = real_mount(mnt)->mnt_ns;
1975 	if (ns) {
1976 		if (is_anon_ns(ns))
1977 			umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
1978 		else
1979 			ns = NULL;
1980 	}
1981 	unlock_mount_hash();
1982 	namespace_unlock();
1983 	if (ns)
1984 		free_mnt_ns(ns);
1985 }
1986 
1987 void drop_collected_mounts(struct vfsmount *mnt)
1988 {
1989 	namespace_lock();
1990 	lock_mount_hash();
1991 	umount_tree(real_mount(mnt), 0);
1992 	unlock_mount_hash();
1993 	namespace_unlock();
1994 }
1995 
1996 static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
1997 {
1998 	struct mount *child;
1999 
2000 	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
2001 		if (!is_subdir(child->mnt_mountpoint, dentry))
2002 			continue;
2003 
2004 		if (child->mnt.mnt_flags & MNT_LOCKED)
2005 			return true;
2006 	}
2007 	return false;
2008 }
2009 
2010 /**
2011  * clone_private_mount - create a private clone of a path
2012  * @path: path to clone
2013  *
2014  * This creates a new vfsmount, which will be the clone of @path.  The new mount
2015  * will not be attached anywhere in the namespace and will be private (i.e.
2016  * changes to the originating mount won't be propagated into this).
2017  *
2018  * Release with mntput().
2019  */
2020 struct vfsmount *clone_private_mount(const struct path *path)
2021 {
2022 	struct mount *old_mnt = real_mount(path->mnt);
2023 	struct mount *new_mnt;
2024 
2025 	down_read(&namespace_sem);
2026 	if (IS_MNT_UNBINDABLE(old_mnt))
2027 		goto invalid;
2028 
2029 	if (!check_mnt(old_mnt))
2030 		goto invalid;
2031 
2032 	if (has_locked_children(old_mnt, path->dentry))
2033 		goto invalid;
2034 
2035 	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
2036 	up_read(&namespace_sem);
2037 
2038 	if (IS_ERR(new_mnt))
2039 		return ERR_CAST(new_mnt);
2040 
2041 	/* Longterm mount to be removed by kern_unmount*() */
2042 	new_mnt->mnt_ns = MNT_NS_INTERNAL;
2043 
2044 	return &new_mnt->mnt;
2045 
2046 invalid:
2047 	up_read(&namespace_sem);
2048 	return ERR_PTR(-EINVAL);
2049 }
2050 EXPORT_SYMBOL_GPL(clone_private_mount);
2051 
2052 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
2053 		   struct vfsmount *root)
2054 {
2055 	struct mount *mnt;
2056 	int res = f(root, arg);
2057 	if (res)
2058 		return res;
2059 	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
2060 		res = f(&mnt->mnt, arg);
2061 		if (res)
2062 			return res;
2063 	}
2064 	return 0;
2065 }
2066 
2067 static void lock_mnt_tree(struct mount *mnt)
2068 {
2069 	struct mount *p;
2070 
2071 	for (p = mnt; p; p = next_mnt(p, mnt)) {
2072 		int flags = p->mnt.mnt_flags;
2073 		/* Don't allow unprivileged users to change mount flags */
2074 		flags |= MNT_LOCK_ATIME;
2075 
2076 		if (flags & MNT_READONLY)
2077 			flags |= MNT_LOCK_READONLY;
2078 
2079 		if (flags & MNT_NODEV)
2080 			flags |= MNT_LOCK_NODEV;
2081 
2082 		if (flags & MNT_NOSUID)
2083 			flags |= MNT_LOCK_NOSUID;
2084 
2085 		if (flags & MNT_NOEXEC)
2086 			flags |= MNT_LOCK_NOEXEC;
2087 		/* Don't allow unprivileged users to reveal what is under a mount */
2088 		if (list_empty(&p->mnt_expire))
2089 			flags |= MNT_LOCKED;
2090 		p->mnt.mnt_flags = flags;
2091 	}
2092 }
2093 
2094 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
2095 {
2096 	struct mount *p;
2097 
2098 	for (p = mnt; p != end; p = next_mnt(p, mnt)) {
2099 		if (p->mnt_group_id && !IS_MNT_SHARED(p))
2100 			mnt_release_group_id(p);
2101 	}
2102 }
2103 
2104 static int invent_group_ids(struct mount *mnt, bool recurse)
2105 {
2106 	struct mount *p;
2107 
2108 	for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
2109 		if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
2110 			int err = mnt_alloc_group_id(p);
2111 			if (err) {
2112 				cleanup_group_ids(mnt, p);
2113 				return err;
2114 			}
2115 		}
2116 	}
2117 
2118 	return 0;
2119 }
2120 
2121 int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
2122 {
2123 	unsigned int max = READ_ONCE(sysctl_mount_max);
2124 	unsigned int mounts = 0;
2125 	struct mount *p;
2126 
2127 	if (ns->mounts >= max)
2128 		return -ENOSPC;
2129 	max -= ns->mounts;
2130 	if (ns->pending_mounts >= max)
2131 		return -ENOSPC;
2132 	max -= ns->pending_mounts;
2133 
2134 	for (p = mnt; p; p = next_mnt(p, mnt))
2135 		mounts++;
2136 
2137 	if (mounts > max)
2138 		return -ENOSPC;
2139 
2140 	ns->pending_mounts += mounts;
2141 	return 0;
2142 }
2143 
2144 /*
2145  *  @source_mnt : mount tree to be attached
2146  *  @dest_mnt   : mount that @source_mnt is to be attached to
2147  *  @dest_mp    : mountpoint dentry on @dest_mnt where @source_mnt
2148  *  		   is attached
2149  *  @moving     : true if @source_mnt is being moved rather than freshly attached
2150  *
2151  *  NOTE: the table below explains the semantics when a source mount
2152  *  of a given type is attached to a destination mount of a given type.
2153  * ---------------------------------------------------------------------------
2154  * |         BIND MOUNT OPERATION                                            |
2155  * |**************************************************************************
2156  * | source-->| shared        |       private  |       slave    | unbindable |
2157  * | dest     |               |                |                |            |
2158  * |   |      |               |                |                |            |
2159  * |   v      |               |                |                |            |
2160  * |**************************************************************************
2161  * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
2162  * |          |               |                |                |            |
2163  * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
2164  * ***************************************************************************
2165  * A bind operation clones the source mount and mounts the clone on the
2166  * destination mount.
2167  *
2168  * (++)  the cloned mount is propagated to all the mounts in the propagation
2169  * 	 tree of the destination mount and the cloned mount is added to
2170  * 	 the peer group of the source mount.
2171  * (+)   the cloned mount is created under the destination mount and is marked
2172  *       as shared. The cloned mount is added to the peer group of the source
2173  *       mount.
2174  * (+++) the mount is propagated to all the mounts in the propagation tree
2175  *       of the destination mount and the cloned mount is made slave
2176  *       of the same master as that of the source mount. The cloned mount
2177  *       is marked as 'shared and slave'.
2178  * (*)   the cloned mount is made a slave of the same master as that of the
2179  * 	 source mount.
2180  *
2181  * ---------------------------------------------------------------------------
2182  * |         		MOVE MOUNT OPERATION                                 |
2183  * |**************************************************************************
2184  * | source-->| shared        |       private  |       slave    | unbindable |
2185  * | dest     |               |                |                |            |
2186  * |   |      |               |                |                |            |
2187  * |   v      |               |                |                |            |
2188  * |**************************************************************************
2189  * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
2190  * |          |               |                |                |            |
2191  * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
2192  * ***************************************************************************
2193  *
2194  * (+)  the mount is moved to the destination. And is then propagated to
2195  * 	all the mounts in the propagation tree of the destination mount.
2196  * (+*)  the mount is moved to the destination.
2197  * (+++)  the mount is moved to the destination and is then propagated to
2198  * 	all the mounts belonging to the destination mount's propagation tree.
2199  * 	the mount is marked as 'shared and slave'.
2200  * (*)	the mount continues to be a slave at the new location.
2201  *
2202  * if the source mount is a tree, the operations explained above are
2203  * applied to each mount in the tree.
2204  * Must be called without spinlocks held, since this function can sleep
2205  * in allocations.
2206  */
2207 static int attach_recursive_mnt(struct mount *source_mnt,
2208 			struct mount *dest_mnt,
2209 			struct mountpoint *dest_mp,
2210 			bool moving)
2211 {
2212 	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2213 	HLIST_HEAD(tree_list);
2214 	struct mnt_namespace *ns = dest_mnt->mnt_ns;
2215 	struct mountpoint *smp;
2216 	struct mount *child, *p;
2217 	struct hlist_node *n;
2218 	int err;
2219 
2220 	/* Preallocate a mountpoint in case the new mounts need
2221 	 * to be tucked under other mounts.
2222 	 */
2223 	smp = get_mountpoint(source_mnt->mnt.mnt_root);
2224 	if (IS_ERR(smp))
2225 		return PTR_ERR(smp);
2226 
2227 	/* Is there space to add these mounts to the mount namespace? */
2228 	if (!moving) {
2229 		err = count_mounts(ns, source_mnt);
2230 		if (err)
2231 			goto out;
2232 	}
2233 
2234 	if (IS_MNT_SHARED(dest_mnt)) {
2235 		err = invent_group_ids(source_mnt, true);
2236 		if (err)
2237 			goto out;
2238 		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
2239 		lock_mount_hash();
2240 		if (err)
2241 			goto out_cleanup_ids;
2242 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
2243 			set_mnt_shared(p);
2244 	} else {
2245 		lock_mount_hash();
2246 	}
2247 	if (moving) {
2248 		unhash_mnt(source_mnt);
2249 		attach_mnt(source_mnt, dest_mnt, dest_mp);
2250 		touch_mnt_namespace(source_mnt->mnt_ns);
2251 	} else {
2252 		if (source_mnt->mnt_ns) {
2253 			/* move from anon - the caller will destroy */
2254 			list_del_init(&source_mnt->mnt_ns->list);
2255 		}
2256 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
2257 		commit_tree(source_mnt);
2258 	}
2259 
2260 	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
2261 		struct mount *q;
2262 		hlist_del_init(&child->mnt_hash);
2263 		q = __lookup_mnt(&child->mnt_parent->mnt,
2264 				 child->mnt_mountpoint);
2265 		if (q)
2266 			mnt_change_mountpoint(child, smp, q);
2267 		/* Notice when we are propagating across user namespaces */
2268 		if (child->mnt_parent->mnt_ns->user_ns != user_ns)
2269 			lock_mnt_tree(child);
2270 		child->mnt.mnt_flags &= ~MNT_LOCKED;
2271 		commit_tree(child);
2272 	}
2273 	put_mountpoint(smp);
2274 	unlock_mount_hash();
2275 
2276 	return 0;
2277 
2278  out_cleanup_ids:
2279 	while (!hlist_empty(&tree_list)) {
2280 		child = hlist_entry(tree_list.first, struct mount, mnt_hash);
2281 		child->mnt_parent->mnt_ns->pending_mounts = 0;
2282 		umount_tree(child, UMOUNT_SYNC);
2283 	}
2284 	unlock_mount_hash();
2285 	cleanup_group_ids(source_mnt, NULL);
2286  out:
2287 	ns->pending_mounts = 0;
2288 
2289 	read_seqlock_excl(&mount_lock);
2290 	put_mountpoint(smp);
2291 	read_sequnlock_excl(&mount_lock);
2292 
2293 	return err;
2294 }
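
/*
 * Illustrative userspace sketch of the "shared source -> shared dest"
 * (++) row of the BIND MOUNT OPERATION table above attach_recursive_mnt().
 * The paths /A, /B and /B/a are made up; /A and /B are assumed to already
 * be mount points and /B/a an existing directory:
 *
 *	#include <sys/mount.h>
 *
 *	mount(NULL, "/A", NULL, MS_SHARED, NULL);	// source is shared
 *	mount(NULL, "/B", NULL, MS_SHARED, NULL);	// destination is shared
 *	mount("/A", "/B/a", NULL, MS_BIND, NULL);	// the clone appears at /B/a,
 *							// propagates to every peer
 *							// of /B and joins /A's
 *							// peer group
 */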
2295 
2296 static struct mountpoint *lock_mount(struct path *path)
2297 {
2298 	struct vfsmount *mnt;
2299 	struct dentry *dentry = path->dentry;
2300 retry:
2301 	inode_lock(dentry->d_inode);
2302 	if (unlikely(cant_mount(dentry))) {
2303 		inode_unlock(dentry->d_inode);
2304 		return ERR_PTR(-ENOENT);
2305 	}
2306 	namespace_lock();
2307 	mnt = lookup_mnt(path);
2308 	if (likely(!mnt)) {
2309 		struct mountpoint *mp = get_mountpoint(dentry);
2310 		if (IS_ERR(mp)) {
2311 			namespace_unlock();
2312 			inode_unlock(dentry->d_inode);
2313 			return mp;
2314 		}
2315 		return mp;
2316 	}
2317 	namespace_unlock();
2318 	inode_unlock(path->dentry->d_inode);
2319 	path_put(path);
2320 	path->mnt = mnt;
2321 	dentry = path->dentry = dget(mnt->mnt_root);
2322 	goto retry;
2323 }
2324 
2325 static void unlock_mount(struct mountpoint *where)
2326 {
2327 	struct dentry *dentry = where->m_dentry;
2328 
2329 	read_seqlock_excl(&mount_lock);
2330 	put_mountpoint(where);
2331 	read_sequnlock_excl(&mount_lock);
2332 
2333 	namespace_unlock();
2334 	inode_unlock(dentry->d_inode);
2335 }
2336 
2337 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
2338 {
2339 	if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
2340 		return -EINVAL;
2341 
2342 	if (d_is_dir(mp->m_dentry) !=
2343 	      d_is_dir(mnt->mnt.mnt_root))
2344 		return -ENOTDIR;
2345 
2346 	return attach_recursive_mnt(mnt, p, mp, false);
2347 }
2348 
2349 /*
2350  * Sanity check the flags to change_mnt_propagation.
2351  */
2352 
2353 static int flags_to_propagation_type(int ms_flags)
2354 {
2355 	int type = ms_flags & ~(MS_REC | MS_SILENT);
2356 
2357 	/* Fail if any non-propagation flags are set */
2358 	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
2359 		return 0;
2360 	/* Only one propagation flag should be set */
2361 	if (!is_power_of_2(type))
2362 		return 0;
2363 	return type;
2364 }
2365 
2366 /*
2367  * recursively change the type of the mountpoint.
2368  */
2369 static int do_change_type(struct path *path, int ms_flags)
2370 {
2371 	struct mount *m;
2372 	struct mount *mnt = real_mount(path->mnt);
2373 	int recurse = ms_flags & MS_REC;
2374 	int type;
2375 	int err = 0;
2376 
2377 	if (path->dentry != path->mnt->mnt_root)
2378 		return -EINVAL;
2379 
2380 	type = flags_to_propagation_type(ms_flags);
2381 	if (!type)
2382 		return -EINVAL;
2383 
2384 	namespace_lock();
2385 	if (type == MS_SHARED) {
2386 		err = invent_group_ids(mnt, recurse);
2387 		if (err)
2388 			goto out_unlock;
2389 	}
2390 
2391 	lock_mount_hash();
2392 	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
2393 		change_mnt_propagation(m, type);
2394 	unlock_mount_hash();
2395 
2396  out_unlock:
2397 	namespace_unlock();
2398 	return err;
2399 }
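
/*
 * Illustrative userspace sketch: do_change_type() is reached via mount(2)
 * with only propagation flags set, e.g.
 *
 *	#include <sys/mount.h>
 *
 *	mount(NULL, "/mnt", NULL, MS_PRIVATE | MS_REC, NULL);
 *
 * which is what "mount --make-rprivate /mnt" boils down to.  Combining two
 * propagation types (say MS_SHARED | MS_SLAVE) fails the is_power_of_2()
 * check in flags_to_propagation_type() and yields -EINVAL.
 */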
2400 
2401 static struct mount *__do_loopback(struct path *old_path, int recurse)
2402 {
2403 	struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
2404 
2405 	if (IS_MNT_UNBINDABLE(old))
2406 		return mnt;
2407 
2408 	if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
2409 		return mnt;
2410 
2411 	if (!recurse && has_locked_children(old, old_path->dentry))
2412 		return mnt;
2413 
2414 	if (recurse)
2415 		mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
2416 	else
2417 		mnt = clone_mnt(old, old_path->dentry, 0);
2418 
2419 	if (!IS_ERR(mnt))
2420 		mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2421 
2422 	return mnt;
2423 }
2424 
2425 /*
2426  * do loopback mount.
2427  */
2428 static int do_loopback(struct path *path, const char *old_name,
2429 				int recurse)
2430 {
2431 	struct path old_path;
2432 	struct mount *mnt = NULL, *parent;
2433 	struct mountpoint *mp;
2434 	int err;
2435 	if (!old_name || !*old_name)
2436 		return -EINVAL;
2437 	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
2438 	if (err)
2439 		return err;
2440 
2441 	err = -EINVAL;
2442 	if (mnt_ns_loop(old_path.dentry))
2443 		goto out;
2444 
2445 	mp = lock_mount(path);
2446 	if (IS_ERR(mp)) {
2447 		err = PTR_ERR(mp);
2448 		goto out;
2449 	}
2450 
2451 	parent = real_mount(path->mnt);
2452 	if (!check_mnt(parent))
2453 		goto out2;
2454 
2455 	mnt = __do_loopback(&old_path, recurse);
2456 	if (IS_ERR(mnt)) {
2457 		err = PTR_ERR(mnt);
2458 		goto out2;
2459 	}
2460 
2461 	err = graft_tree(mnt, parent, mp);
2462 	if (err) {
2463 		lock_mount_hash();
2464 		umount_tree(mnt, UMOUNT_SYNC);
2465 		unlock_mount_hash();
2466 	}
2467 out2:
2468 	unlock_mount(mp);
2469 out:
2470 	path_put(&old_path);
2471 	return err;
2472 }
2473 
2474 static struct file *open_detached_copy(struct path *path, bool recursive)
2475 {
2476 	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2477 	struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
2478 	struct mount *mnt, *p;
2479 	struct file *file;
2480 
2481 	if (IS_ERR(ns))
2482 		return ERR_CAST(ns);
2483 
2484 	namespace_lock();
2485 	mnt = __do_loopback(path, recursive);
2486 	if (IS_ERR(mnt)) {
2487 		namespace_unlock();
2488 		free_mnt_ns(ns);
2489 		return ERR_CAST(mnt);
2490 	}
2491 
2492 	lock_mount_hash();
2493 	for (p = mnt; p; p = next_mnt(p, mnt)) {
2494 		p->mnt_ns = ns;
2495 		ns->mounts++;
2496 	}
2497 	ns->root = mnt;
2498 	list_add_tail(&ns->list, &mnt->mnt_list);
2499 	mntget(&mnt->mnt);
2500 	unlock_mount_hash();
2501 	namespace_unlock();
2502 
2503 	mntput(path->mnt);
2504 	path->mnt = &mnt->mnt;
2505 	file = dentry_open(path, O_PATH, current_cred());
2506 	if (IS_ERR(file))
2507 		dissolve_on_fput(path->mnt);
2508 	else
2509 		file->f_mode |= FMODE_NEED_UNMOUNT;
2510 	return file;
2511 }
2512 
2513 SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
2514 {
2515 	struct file *file;
2516 	struct path path;
2517 	int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
2518 	bool detached = flags & OPEN_TREE_CLONE;
2519 	int error;
2520 	int fd;
2521 
2522 	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
2523 
2524 	if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
2525 		      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
2526 		      OPEN_TREE_CLOEXEC))
2527 		return -EINVAL;
2528 
2529 	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
2530 		return -EINVAL;
2531 
2532 	if (flags & AT_NO_AUTOMOUNT)
2533 		lookup_flags &= ~LOOKUP_AUTOMOUNT;
2534 	if (flags & AT_SYMLINK_NOFOLLOW)
2535 		lookup_flags &= ~LOOKUP_FOLLOW;
2536 	if (flags & AT_EMPTY_PATH)
2537 		lookup_flags |= LOOKUP_EMPTY;
2538 
2539 	if (detached && !may_mount())
2540 		return -EPERM;
2541 
2542 	fd = get_unused_fd_flags(flags & O_CLOEXEC);
2543 	if (fd < 0)
2544 		return fd;
2545 
2546 	error = user_path_at(dfd, filename, lookup_flags, &path);
2547 	if (unlikely(error)) {
2548 		file = ERR_PTR(error);
2549 	} else {
2550 		if (detached)
2551 			file = open_detached_copy(&path, flags & AT_RECURSIVE);
2552 		else
2553 			file = dentry_open(&path, O_PATH, current_cred());
2554 		path_put(&path);
2555 	}
2556 	if (IS_ERR(file)) {
2557 		put_unused_fd(fd);
2558 		return PTR_ERR(file);
2559 	}
2560 	fd_install(fd, file);
2561 	return fd;
2562 }
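
/*
 * Illustrative userspace sketch (assumes a libc that exposes the new mount
 * API wrappers; otherwise go through syscall(2)).  It clones the subtree
 * under the made-up path /src into a detached tree and attaches the copy
 * at /dst; error handling is omitted:
 *
 *	#include <fcntl.h>
 *	#include <sys/mount.h>
 *	#include <unistd.h>
 *
 *	int fd = open_tree(AT_FDCWD, "/src",
 *			   OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE);
 *	move_mount(fd, "", AT_FDCWD, "/dst", MOVE_MOUNT_F_EMPTY_PATH);
 *	close(fd);
 */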
2563 
2564 /*
2565  * Don't allow locked mount flags to be cleared.
2566  *
2567  * No locks need to be held here while testing the various MNT_LOCK
2568  * flags because those flags can never be cleared once they are set.
2569  */
2570 static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
2571 {
2572 	unsigned int fl = mnt->mnt.mnt_flags;
2573 
2574 	if ((fl & MNT_LOCK_READONLY) &&
2575 	    !(mnt_flags & MNT_READONLY))
2576 		return false;
2577 
2578 	if ((fl & MNT_LOCK_NODEV) &&
2579 	    !(mnt_flags & MNT_NODEV))
2580 		return false;
2581 
2582 	if ((fl & MNT_LOCK_NOSUID) &&
2583 	    !(mnt_flags & MNT_NOSUID))
2584 		return false;
2585 
2586 	if ((fl & MNT_LOCK_NOEXEC) &&
2587 	    !(mnt_flags & MNT_NOEXEC))
2588 		return false;
2589 
2590 	if ((fl & MNT_LOCK_ATIME) &&
2591 	    ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
2592 		return false;
2593 
2594 	return true;
2595 }
2596 
2597 static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
2598 {
2599 	bool readonly_request = (mnt_flags & MNT_READONLY);
2600 
2601 	if (readonly_request == __mnt_is_readonly(&mnt->mnt))
2602 		return 0;
2603 
2604 	if (readonly_request)
2605 		return mnt_make_readonly(mnt);
2606 
2607 	mnt->mnt.mnt_flags &= ~MNT_READONLY;
2608 	return 0;
2609 }
2610 
2611 static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
2612 {
2613 	mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
2614 	mnt->mnt.mnt_flags = mnt_flags;
2615 	touch_mnt_namespace(mnt->mnt_ns);
2616 }
2617 
2618 static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
2619 {
2620 	struct super_block *sb = mnt->mnt_sb;
2621 
2622 	if (!__mnt_is_readonly(mnt) &&
2623 	   (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
2624 	   (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
2625 		char *buf = (char *)__get_free_page(GFP_KERNEL);
2626 		char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
2627 
2628 		pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
2629 			sb->s_type->name,
2630 			is_mounted(mnt) ? "remounted" : "mounted",
2631 			mntpath, &sb->s_time_max,
2632 			(unsigned long long)sb->s_time_max);
2633 
2634 		free_page((unsigned long)buf);
2635 		sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
2636 	}
2637 }
2638 
2639 /*
2640  * Handle reconfiguration of the mountpoint only without alteration of the
2641  * superblock it refers to.  This is triggered by specifying MS_REMOUNT|MS_BIND
2642  * to mount(2).
2643  */
2644 static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
2645 {
2646 	struct super_block *sb = path->mnt->mnt_sb;
2647 	struct mount *mnt = real_mount(path->mnt);
2648 	int ret;
2649 
2650 	if (!check_mnt(mnt))
2651 		return -EINVAL;
2652 
2653 	if (path->dentry != mnt->mnt.mnt_root)
2654 		return -EINVAL;
2655 
2656 	if (!can_change_locked_flags(mnt, mnt_flags))
2657 		return -EPERM;
2658 
2659 	/*
2660 	 * We're only checking whether the superblock is read-only, not
2661 	 * changing it, so only take down_read(&sb->s_umount).
2662 	 */
2663 	down_read(&sb->s_umount);
2664 	lock_mount_hash();
2665 	ret = change_mount_ro_state(mnt, mnt_flags);
2666 	if (ret == 0)
2667 		set_mount_attributes(mnt, mnt_flags);
2668 	unlock_mount_hash();
2669 	up_read(&sb->s_umount);
2670 
2671 	mnt_warn_timestamp_expiry(path, &mnt->mnt);
2672 
2673 	return ret;
2674 }
2675 
2676 /*
2677  * change filesystem flags. dir should be the physical root of the filesystem.
2678  * If you've mounted a non-root directory somewhere and want to do remount
2679  * on it - tough luck.
2680  */
2681 static int do_remount(struct path *path, int ms_flags, int sb_flags,
2682 		      int mnt_flags, void *data)
2683 {
2684 	int err;
2685 	struct super_block *sb = path->mnt->mnt_sb;
2686 	struct mount *mnt = real_mount(path->mnt);
2687 	struct fs_context *fc;
2688 
2689 	if (!check_mnt(mnt))
2690 		return -EINVAL;
2691 
2692 	if (path->dentry != path->mnt->mnt_root)
2693 		return -EINVAL;
2694 
2695 	if (!can_change_locked_flags(mnt, mnt_flags))
2696 		return -EPERM;
2697 
2698 	fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
2699 	if (IS_ERR(fc))
2700 		return PTR_ERR(fc);
2701 
2702 	fc->oldapi = true;
2703 	err = parse_monolithic_mount_data(fc, data);
2704 	if (!err) {
2705 		down_write(&sb->s_umount);
2706 		err = -EPERM;
2707 		if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
2708 			err = reconfigure_super(fc);
2709 			if (!err) {
2710 				lock_mount_hash();
2711 				set_mount_attributes(mnt, mnt_flags);
2712 				unlock_mount_hash();
2713 			}
2714 		}
2715 		up_write(&sb->s_umount);
2716 	}
2717 
2718 	mnt_warn_timestamp_expiry(path, &mnt->mnt);
2719 
2720 	put_fs_context(fc);
2721 	return err;
2722 }
2723 
2724 static inline int tree_contains_unbindable(struct mount *mnt)
2725 {
2726 	struct mount *p;
2727 	for (p = mnt; p; p = next_mnt(p, mnt)) {
2728 		if (IS_MNT_UNBINDABLE(p))
2729 			return 1;
2730 	}
2731 	return 0;
2732 }
2733 
2734 /*
2735  * Check that there aren't references to earlier/same mount namespaces in the
2736  * specified subtree.  Such references can act as pins for mount namespaces
2737  * that aren't checked by the mount-cycle checking code, thereby allowing
2738  * cycles to be made.
2739  */
2740 static bool check_for_nsfs_mounts(struct mount *subtree)
2741 {
2742 	struct mount *p;
2743 	bool ret = false;
2744 
2745 	lock_mount_hash();
2746 	for (p = subtree; p; p = next_mnt(p, subtree))
2747 		if (mnt_ns_loop(p->mnt.mnt_root))
2748 			goto out;
2749 
2750 	ret = true;
2751 out:
2752 	unlock_mount_hash();
2753 	return ret;
2754 }
2755 
2756 static int do_set_group(struct path *from_path, struct path *to_path)
2757 {
2758 	struct mount *from, *to;
2759 	int err;
2760 
2761 	from = real_mount(from_path->mnt);
2762 	to = real_mount(to_path->mnt);
2763 
2764 	namespace_lock();
2765 
2766 	err = -EINVAL;
2767 	/* To and From must be mounted */
2768 	if (!is_mounted(&from->mnt))
2769 		goto out;
2770 	if (!is_mounted(&to->mnt))
2771 		goto out;
2772 
2773 	err = -EPERM;
2774 	/* We should be allowed to modify mount namespaces of both mounts */
2775 	if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
2776 		goto out;
2777 	if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
2778 		goto out;
2779 
2780 	err = -EINVAL;
2781 	/* To and From paths should be mount roots */
2782 	if (from_path->dentry != from_path->mnt->mnt_root)
2783 		goto out;
2784 	if (to_path->dentry != to_path->mnt->mnt_root)
2785 		goto out;
2786 
2787 	/* Setting sharing groups is only allowed across the same superblock */
2788 	if (from->mnt.mnt_sb != to->mnt.mnt_sb)
2789 		goto out;
2790 
2791 	/* From mount root should be wider than To mount root */
2792 	if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
2793 		goto out;
2794 
2795 	/* From mount should not have locked children in place of To's root */
2796 	if (has_locked_children(from, to->mnt.mnt_root))
2797 		goto out;
2798 
2799 	/* Setting sharing groups is only allowed on private mounts */
2800 	if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
2801 		goto out;
2802 
2803 	/* From should not be private */
2804 	if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
2805 		goto out;
2806 
2807 	if (IS_MNT_SLAVE(from)) {
2808 		struct mount *m = from->mnt_master;
2809 
2810 		list_add(&to->mnt_slave, &m->mnt_slave_list);
2811 		to->mnt_master = m;
2812 	}
2813 
2814 	if (IS_MNT_SHARED(from)) {
2815 		to->mnt_group_id = from->mnt_group_id;
2816 		list_add(&to->mnt_share, &from->mnt_share);
2817 		lock_mount_hash();
2818 		set_mnt_shared(to);
2819 		unlock_mount_hash();
2820 	}
2821 
2822 	err = 0;
2823 out:
2824 	namespace_unlock();
2825 	return err;
2826 }
2827 
2828 static int do_move_mount(struct path *old_path, struct path *new_path)
2829 {
2830 	struct mnt_namespace *ns;
2831 	struct mount *p;
2832 	struct mount *old;
2833 	struct mount *parent;
2834 	struct mountpoint *mp, *old_mp;
2835 	int err;
2836 	bool attached;
2837 
2838 	mp = lock_mount(new_path);
2839 	if (IS_ERR(mp))
2840 		return PTR_ERR(mp);
2841 
2842 	old = real_mount(old_path->mnt);
2843 	p = real_mount(new_path->mnt);
2844 	parent = old->mnt_parent;
2845 	attached = mnt_has_parent(old);
2846 	old_mp = old->mnt_mp;
2847 	ns = old->mnt_ns;
2848 
2849 	err = -EINVAL;
2850 	/* The mountpoint must be in our namespace. */
2851 	if (!check_mnt(p))
2852 		goto out;
2853 
2854 	/* The thing moved must be mounted... */
2855 	if (!is_mounted(&old->mnt))
2856 		goto out;
2857 
2858 	/* ... and either ours or the root of anon namespace */
2859 	if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
2860 		goto out;
2861 
2862 	if (old->mnt.mnt_flags & MNT_LOCKED)
2863 		goto out;
2864 
2865 	if (old_path->dentry != old_path->mnt->mnt_root)
2866 		goto out;
2867 
2868 	if (d_is_dir(new_path->dentry) !=
2869 	    d_is_dir(old_path->dentry))
2870 		goto out;
2871 	/*
2872 	 * Don't move a mount residing in a shared parent.
2873 	 */
2874 	if (attached && IS_MNT_SHARED(parent))
2875 		goto out;
2876 	/*
2877 	 * Don't move a mount tree containing unbindable mounts to a destination
2878 	 * mount which is shared.
2879 	 */
2880 	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
2881 		goto out;
2882 	err = -ELOOP;
2883 	if (!check_for_nsfs_mounts(old))
2884 		goto out;
2885 	for (; mnt_has_parent(p); p = p->mnt_parent)
2886 		if (p == old)
2887 			goto out;
2888 
2889 	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
2890 				   attached);
2891 	if (err)
2892 		goto out;
2893 
2894 	/* if the mount is moved, it should no longer expire
2895 	 * automatically */
2896 	list_del_init(&old->mnt_expire);
2897 	if (attached)
2898 		put_mountpoint(old_mp);
2899 out:
2900 	unlock_mount(mp);
2901 	if (!err) {
2902 		if (attached)
2903 			mntput_no_expire(parent);
2904 		else
2905 			free_mnt_ns(ns);
2906 	}
2907 	return err;
2908 }
2909 
2910 static int do_move_mount_old(struct path *path, const char *old_name)
2911 {
2912 	struct path old_path;
2913 	int err;
2914 
2915 	if (!old_name || !*old_name)
2916 		return -EINVAL;
2917 
2918 	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
2919 	if (err)
2920 		return err;
2921 
2922 	err = do_move_mount(&old_path, path);
2923 	path_put(&old_path);
2924 	return err;
2925 }
2926 
2927 /*
2928  * add a mount into a namespace's mount tree
2929  */
2930 static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
2931 			const struct path *path, int mnt_flags)
2932 {
2933 	struct mount *parent = real_mount(path->mnt);
2934 
2935 	mnt_flags &= ~MNT_INTERNAL_FLAGS;
2936 
2937 	if (unlikely(!check_mnt(parent))) {
2938 		/* that's acceptable only for automounts done in private ns */
2939 		if (!(mnt_flags & MNT_SHRINKABLE))
2940 			return -EINVAL;
2941 		/* ... and for those we'd better have mountpoint still alive */
2942 		if (!parent->mnt_ns)
2943 			return -EINVAL;
2944 	}
2945 
2946 	/* Refuse the same filesystem on the same mount point */
2947 	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
2948 	    path->mnt->mnt_root == path->dentry)
2949 		return -EBUSY;
2950 
2951 	if (d_is_symlink(newmnt->mnt.mnt_root))
2952 		return -EINVAL;
2953 
2954 	newmnt->mnt.mnt_flags = mnt_flags;
2955 	return graft_tree(newmnt, parent, mp);
2956 }
2957 
2958 static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
2959 
2960 /*
2961  * Create a new mount using a superblock configuration and request it
2962  * be added to the namespace tree.
2963  */
2964 static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
2965 			   unsigned int mnt_flags)
2966 {
2967 	struct vfsmount *mnt;
2968 	struct mountpoint *mp;
2969 	struct super_block *sb = fc->root->d_sb;
2970 	int error;
2971 
2972 	error = security_sb_kern_mount(sb);
2973 	if (!error && mount_too_revealing(sb, &mnt_flags))
2974 		error = -EPERM;
2975 
2976 	if (unlikely(error)) {
2977 		fc_drop_locked(fc);
2978 		return error;
2979 	}
2980 
2981 	up_write(&sb->s_umount);
2982 
2983 	mnt = vfs_create_mount(fc);
2984 	if (IS_ERR(mnt))
2985 		return PTR_ERR(mnt);
2986 
2987 	mnt_warn_timestamp_expiry(mountpoint, mnt);
2988 
2989 	mp = lock_mount(mountpoint);
2990 	if (IS_ERR(mp)) {
2991 		mntput(mnt);
2992 		return PTR_ERR(mp);
2993 	}
2994 	error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
2995 	unlock_mount(mp);
2996 	if (error < 0)
2997 		mntput(mnt);
2998 	return error;
2999 }
3000 
3001 /*
3002  * create a new mount for userspace and request it to be added into the
3003  * namespace's tree
3004  */
3005 static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
3006 			int mnt_flags, const char *name, void *data)
3007 {
3008 	struct file_system_type *type;
3009 	struct fs_context *fc;
3010 	const char *subtype = NULL;
3011 	int err = 0;
3012 
3013 	if (!fstype)
3014 		return -EINVAL;
3015 
3016 	type = get_fs_type(fstype);
3017 	if (!type)
3018 		return -ENODEV;
3019 
3020 	if (type->fs_flags & FS_HAS_SUBTYPE) {
3021 		subtype = strchr(fstype, '.');
3022 		if (subtype) {
3023 			subtype++;
3024 			if (!*subtype) {
3025 				put_filesystem(type);
3026 				return -EINVAL;
3027 			}
3028 		}
3029 	}
3030 
3031 	fc = fs_context_for_mount(type, sb_flags);
3032 	put_filesystem(type);
3033 	if (IS_ERR(fc))
3034 		return PTR_ERR(fc);
3035 
3036 	if (subtype)
3037 		err = vfs_parse_fs_string(fc, "subtype",
3038 					  subtype, strlen(subtype));
3039 	if (!err && name)
3040 		err = vfs_parse_fs_string(fc, "source", name, strlen(name));
3041 	if (!err)
3042 		err = parse_monolithic_mount_data(fc, data);
3043 	if (!err && !mount_capable(fc))
3044 		err = -EPERM;
3045 	if (!err)
3046 		err = vfs_get_tree(fc);
3047 	if (!err)
3048 		err = do_new_mount_fc(fc, path, mnt_flags);
3049 
3050 	put_fs_context(fc);
3051 	return err;
3052 }
3053 
3054 int finish_automount(struct vfsmount *m, const struct path *path)
3055 {
3056 	struct dentry *dentry = path->dentry;
3057 	struct mountpoint *mp;
3058 	struct mount *mnt;
3059 	int err;
3060 
3061 	if (!m)
3062 		return 0;
3063 	if (IS_ERR(m))
3064 		return PTR_ERR(m);
3065 
3066 	mnt = real_mount(m);
3067 	/* The new mount record should have at least 2 refs to prevent it being
3068 	 * expired before we get a chance to add it
3069 	 */
3070 	BUG_ON(mnt_get_count(mnt) < 2);
3071 
3072 	if (m->mnt_sb == path->mnt->mnt_sb &&
3073 	    m->mnt_root == dentry) {
3074 		err = -ELOOP;
3075 		goto discard;
3076 	}
3077 
3078 	/*
3079 	 * we don't want to use lock_mount() - in this case finding something
3080 	 * that overmounts our mountpoint means "quietly drop what we've
3081 	 * got", not "try to mount it on top".
3082 	 */
3083 	inode_lock(dentry->d_inode);
3084 	namespace_lock();
3085 	if (unlikely(cant_mount(dentry))) {
3086 		err = -ENOENT;
3087 		goto discard_locked;
3088 	}
3089 	rcu_read_lock();
3090 	if (unlikely(__lookup_mnt(path->mnt, dentry))) {
3091 		rcu_read_unlock();
3092 		err = 0;
3093 		goto discard_locked;
3094 	}
3095 	rcu_read_unlock();
3096 	mp = get_mountpoint(dentry);
3097 	if (IS_ERR(mp)) {
3098 		err = PTR_ERR(mp);
3099 		goto discard_locked;
3100 	}
3101 
3102 	err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
3103 	unlock_mount(mp);
3104 	if (unlikely(err))
3105 		goto discard;
3106 	mntput(m);
3107 	return 0;
3108 
3109 discard_locked:
3110 	namespace_unlock();
3111 	inode_unlock(dentry->d_inode);
3112 discard:
3113 	/* remove m from any expiration list it may be on */
3114 	if (!list_empty(&mnt->mnt_expire)) {
3115 		namespace_lock();
3116 		list_del_init(&mnt->mnt_expire);
3117 		namespace_unlock();
3118 	}
3119 	mntput(m);
3120 	mntput(m);
3121 	return err;
3122 }
3123 
3124 /**
3125  * mnt_set_expiry - Put a mount on an expiration list
3126  * @mnt: The mount to list.
3127  * @expiry_list: The list to add the mount to.
3128  */
3129 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
3130 {
3131 	namespace_lock();
3132 
3133 	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
3134 
3135 	namespace_unlock();
3136 }
3137 EXPORT_SYMBOL(mnt_set_expiry);
3138 
3139 /*
3140  * process a list of expirable mountpoints with the intent of discarding any
3141  * mountpoints that aren't in use and haven't been touched since last we came
3142  * here
3143  */
3144 void mark_mounts_for_expiry(struct list_head *mounts)
3145 {
3146 	struct mount *mnt, *next;
3147 	LIST_HEAD(graveyard);
3148 
3149 	if (list_empty(mounts))
3150 		return;
3151 
3152 	namespace_lock();
3153 	lock_mount_hash();
3154 
3155 	/* extract from the expiration list every vfsmount that matches the
3156 	 * following criteria:
3157 	 * - only referenced by its parent vfsmount
3158 	 * - still marked for expiry (marked on the last call here; marks are
3159 	 *   cleared by mntput())
3160 	 */
3161 	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
3162 		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
3163 			propagate_mount_busy(mnt, 1))
3164 			continue;
3165 		list_move(&mnt->mnt_expire, &graveyard);
3166 	}
3167 	while (!list_empty(&graveyard)) {
3168 		mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
3169 		touch_mnt_namespace(mnt->mnt_ns);
3170 		umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
3171 	}
3172 	unlock_mount_hash();
3173 	namespace_unlock();
3174 }
3175 
3176 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
3177 
3178 /*
3179  * Ripoff of 'select_parent()'
3180  *
3181  * search the list of submounts for a given mountpoint, and move any
3182  * shrinkable submounts to the 'graveyard' list.
3183  */
3184 static int select_submounts(struct mount *parent, struct list_head *graveyard)
3185 {
3186 	struct mount *this_parent = parent;
3187 	struct list_head *next;
3188 	int found = 0;
3189 
3190 repeat:
3191 	next = this_parent->mnt_mounts.next;
3192 resume:
3193 	while (next != &this_parent->mnt_mounts) {
3194 		struct list_head *tmp = next;
3195 		struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
3196 
3197 		next = tmp->next;
3198 		if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
3199 			continue;
3200 		/*
3201 		 * Descend a level if the mnt_mounts list is non-empty.
3202 		 */
3203 		if (!list_empty(&mnt->mnt_mounts)) {
3204 			this_parent = mnt;
3205 			goto repeat;
3206 		}
3207 
3208 		if (!propagate_mount_busy(mnt, 1)) {
3209 			list_move_tail(&mnt->mnt_expire, graveyard);
3210 			found++;
3211 		}
3212 	}
3213 	/*
3214 	 * All done at this level ... ascend and resume the search
3215 	 */
3216 	if (this_parent != parent) {
3217 		next = this_parent->mnt_child.next;
3218 		this_parent = this_parent->mnt_parent;
3219 		goto resume;
3220 	}
3221 	return found;
3222 }
3223 
3224 /*
3225  * process a list of expirable mountpoints with the intent of discarding any
3226  * submounts of a specific parent mountpoint
3227  *
3228  * mount_lock must be held for write
3229  */
3230 static void shrink_submounts(struct mount *mnt)
3231 {
3232 	LIST_HEAD(graveyard);
3233 	struct mount *m;
3234 
3235 	/* extract submounts of 'mountpoint' from the expiration list */
3236 	while (select_submounts(mnt, &graveyard)) {
3237 		while (!list_empty(&graveyard)) {
3238 			m = list_first_entry(&graveyard, struct mount,
3239 						mnt_expire);
3240 			touch_mnt_namespace(m->mnt_ns);
3241 			umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
3242 		}
3243 	}
3244 }
3245 
3246 static void *copy_mount_options(const void __user * data)
3247 {
3248 	char *copy;
3249 	unsigned left, offset;
3250 
3251 	if (!data)
3252 		return NULL;
3253 
3254 	copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
3255 	if (!copy)
3256 		return ERR_PTR(-ENOMEM);
3257 
3258 	left = copy_from_user(copy, data, PAGE_SIZE);
3259 
3260 	/*
3261 	 * Not all architectures have an exact copy_from_user(). Resort to
3262 	 * copying a byte at a time.
3263 	 */
3264 	offset = PAGE_SIZE - left;
3265 	while (left) {
3266 		char c;
3267 		if (get_user(c, (const char __user *)data + offset))
3268 			break;
3269 		copy[offset] = c;
3270 		left--;
3271 		offset++;
3272 	}
3273 
3274 	if (left == PAGE_SIZE) {
3275 		kfree(copy);
3276 		return ERR_PTR(-EFAULT);
3277 	}
3278 
3279 	return copy;
3280 }
3281 
3282 static char *copy_mount_string(const void __user *data)
3283 {
3284 	return data ? strndup_user(data, PATH_MAX) : NULL;
3285 }
3286 
3287 /*
3288  * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
3289  * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
3290  *
3291  * data is a (void *) that can point to any structure up to
3292  * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
3293  * information (or be NULL).
3294  *
3295  * Pre-0.97 versions of mount() didn't have a flags word.
3296  * When the flags word was introduced its top half was required
3297  * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
3298  * Therefore, if this magic number is present, it carries no information
3299  * and must be discarded.
3300  */
3301 int path_mount(const char *dev_name, struct path *path,
3302 		const char *type_page, unsigned long flags, void *data_page)
3303 {
3304 	unsigned int mnt_flags = 0, sb_flags;
3305 	int ret;
3306 
3307 	/* Discard magic */
3308 	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
3309 		flags &= ~MS_MGC_MSK;
3310 
3311 	/* Basic sanity checks */
3312 	if (data_page)
3313 		((char *)data_page)[PAGE_SIZE - 1] = 0;
3314 
3315 	if (flags & MS_NOUSER)
3316 		return -EINVAL;
3317 
3318 	ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
3319 	if (ret)
3320 		return ret;
3321 	if (!may_mount())
3322 		return -EPERM;
3323 	if (flags & SB_MANDLOCK)
3324 		warn_mandlock();
3325 
3326 	/* Default to relatime unless overridden */
3327 	if (!(flags & MS_NOATIME))
3328 		mnt_flags |= MNT_RELATIME;
3329 
3330 	/* Separate the per-mountpoint flags */
3331 	if (flags & MS_NOSUID)
3332 		mnt_flags |= MNT_NOSUID;
3333 	if (flags & MS_NODEV)
3334 		mnt_flags |= MNT_NODEV;
3335 	if (flags & MS_NOEXEC)
3336 		mnt_flags |= MNT_NOEXEC;
3337 	if (flags & MS_NOATIME)
3338 		mnt_flags |= MNT_NOATIME;
3339 	if (flags & MS_NODIRATIME)
3340 		mnt_flags |= MNT_NODIRATIME;
3341 	if (flags & MS_STRICTATIME)
3342 		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
3343 	if (flags & MS_RDONLY)
3344 		mnt_flags |= MNT_READONLY;
3345 	if (flags & MS_NOSYMFOLLOW)
3346 		mnt_flags |= MNT_NOSYMFOLLOW;
3347 
3348 	/* The default atime for remount is preservation */
3349 	if ((flags & MS_REMOUNT) &&
3350 	    ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
3351 		       MS_STRICTATIME)) == 0)) {
3352 		mnt_flags &= ~MNT_ATIME_MASK;
3353 		mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
3354 	}
3355 
3356 	sb_flags = flags & (SB_RDONLY |
3357 			    SB_SYNCHRONOUS |
3358 			    SB_MANDLOCK |
3359 			    SB_DIRSYNC |
3360 			    SB_SILENT |
3361 			    SB_POSIXACL |
3362 			    SB_LAZYTIME |
3363 			    SB_I_VERSION);
3364 
3365 	if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
3366 		return do_reconfigure_mnt(path, mnt_flags);
3367 	if (flags & MS_REMOUNT)
3368 		return do_remount(path, flags, sb_flags, mnt_flags, data_page);
3369 	if (flags & MS_BIND)
3370 		return do_loopback(path, dev_name, flags & MS_REC);
3371 	if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
3372 		return do_change_type(path, flags);
3373 	if (flags & MS_MOVE)
3374 		return do_move_mount_old(path, dev_name);
3375 
3376 	return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
3377 			    data_page);
3378 }
3379 
3380 long do_mount(const char *dev_name, const char __user *dir_name,
3381 		const char *type_page, unsigned long flags, void *data_page)
3382 {
3383 	struct path path;
3384 	int ret;
3385 
3386 	ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
3387 	if (ret)
3388 		return ret;
3389 	ret = path_mount(dev_name, &path, type_page, flags, data_page);
3390 	path_put(&path);
3391 	return ret;
3392 }
3393 
3394 static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
3395 {
3396 	return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
3397 }
3398 
3399 static void dec_mnt_namespaces(struct ucounts *ucounts)
3400 {
3401 	dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
3402 }
3403 
3404 static void free_mnt_ns(struct mnt_namespace *ns)
3405 {
3406 	if (!is_anon_ns(ns))
3407 		ns_free_inum(&ns->ns);
3408 	dec_mnt_namespaces(ns->ucounts);
3409 	put_user_ns(ns->user_ns);
3410 	kfree(ns);
3411 }
3412 
3413 /*
3414  * Assign a sequence number so we can detect when we attempt to bind
3415  * mount a reference to an older mount namespace into the current
3416  * mount namespace, preventing reference counting loops.  A 64-bit
3417  * counter incrementing once per nanosecond would take over 580 years
3418  * to wrap, which is effectively never, so we can ignore the possibility.
3419  */
3420 static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
3421 
3422 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
3423 {
3424 	struct mnt_namespace *new_ns;
3425 	struct ucounts *ucounts;
3426 	int ret;
3427 
3428 	ucounts = inc_mnt_namespaces(user_ns);
3429 	if (!ucounts)
3430 		return ERR_PTR(-ENOSPC);
3431 
3432 	new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT);
3433 	if (!new_ns) {
3434 		dec_mnt_namespaces(ucounts);
3435 		return ERR_PTR(-ENOMEM);
3436 	}
3437 	if (!anon) {
3438 		ret = ns_alloc_inum(&new_ns->ns);
3439 		if (ret) {
3440 			kfree(new_ns);
3441 			dec_mnt_namespaces(ucounts);
3442 			return ERR_PTR(ret);
3443 		}
3444 	}
3445 	new_ns->ns.ops = &mntns_operations;
3446 	if (!anon)
3447 		new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
3448 	refcount_set(&new_ns->ns.count, 1);
3449 	INIT_LIST_HEAD(&new_ns->list);
3450 	init_waitqueue_head(&new_ns->poll);
3451 	spin_lock_init(&new_ns->ns_lock);
3452 	new_ns->user_ns = get_user_ns(user_ns);
3453 	new_ns->ucounts = ucounts;
3454 	return new_ns;
3455 }
3456 
3457 __latent_entropy
3458 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
3459 		struct user_namespace *user_ns, struct fs_struct *new_fs)
3460 {
3461 	struct mnt_namespace *new_ns;
3462 	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
3463 	struct mount *p, *q;
3464 	struct mount *old;
3465 	struct mount *new;
3466 	int copy_flags;
3467 
3468 	BUG_ON(!ns);
3469 
3470 	if (likely(!(flags & CLONE_NEWNS))) {
3471 		get_mnt_ns(ns);
3472 		return ns;
3473 	}
3474 
3475 	old = ns->root;
3476 
3477 	new_ns = alloc_mnt_ns(user_ns, false);
3478 	if (IS_ERR(new_ns))
3479 		return new_ns;
3480 
3481 	namespace_lock();
3482 	/* First pass: copy the tree topology */
3483 	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
3484 	if (user_ns != ns->user_ns)
3485 		copy_flags |= CL_SHARED_TO_SLAVE;
3486 	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
3487 	if (IS_ERR(new)) {
3488 		namespace_unlock();
3489 		free_mnt_ns(new_ns);
3490 		return ERR_CAST(new);
3491 	}
3492 	if (user_ns != ns->user_ns) {
3493 		lock_mount_hash();
3494 		lock_mnt_tree(new);
3495 		unlock_mount_hash();
3496 	}
3497 	new_ns->root = new;
3498 	list_add_tail(&new_ns->list, &new->mnt_list);
3499 
3500 	/*
3501 	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
3502 	 * as belonging to new namespace.  We have already acquired a private
3503 	 * fs_struct, so tsk->fs->lock is not needed.
3504 	 */
3505 	p = old;
3506 	q = new;
3507 	while (p) {
3508 		q->mnt_ns = new_ns;
3509 		new_ns->mounts++;
3510 		if (new_fs) {
3511 			if (&p->mnt == new_fs->root.mnt) {
3512 				new_fs->root.mnt = mntget(&q->mnt);
3513 				rootmnt = &p->mnt;
3514 			}
3515 			if (&p->mnt == new_fs->pwd.mnt) {
3516 				new_fs->pwd.mnt = mntget(&q->mnt);
3517 				pwdmnt = &p->mnt;
3518 			}
3519 		}
3520 		p = next_mnt(p, old);
3521 		q = next_mnt(q, new);
3522 		if (!q)
3523 			break;
3524 		// an mntns binding we'd skipped?
3525 		while (p->mnt.mnt_root != q->mnt.mnt_root)
3526 			p = next_mnt(skip_mnt_tree(p), old);
3527 	}
3528 	namespace_unlock();
3529 
3530 	if (rootmnt)
3531 		mntput(rootmnt);
3532 	if (pwdmnt)
3533 		mntput(pwdmnt);
3534 
3535 	return new_ns;
3536 }
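
/*
 * Sketch of how userspace typically ends up in copy_mnt_ns() (an
 * illustrative example, not part of this file's interface):
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <sys/mount.h>
 *
 *	unshare(CLONE_NEWNS);				   // copies the mount ns
 *	mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL); // conventional follow-up
 *							   // to stop propagation
 *							   // back to the parent ns
 */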
3537 
3538 struct dentry *mount_subtree(struct vfsmount *m, const char *name)
3539 {
3540 	struct mount *mnt = real_mount(m);
3541 	struct mnt_namespace *ns;
3542 	struct super_block *s;
3543 	struct path path;
3544 	int err;
3545 
3546 	ns = alloc_mnt_ns(&init_user_ns, true);
3547 	if (IS_ERR(ns)) {
3548 		mntput(m);
3549 		return ERR_CAST(ns);
3550 	}
3551 	mnt->mnt_ns = ns;
3552 	ns->root = mnt;
3553 	ns->mounts++;
3554 	list_add(&mnt->mnt_list, &ns->list);
3555 
3556 	err = vfs_path_lookup(m->mnt_root, m,
3557 			name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
3558 
3559 	put_mnt_ns(ns);
3560 
3561 	if (err)
3562 		return ERR_PTR(err);
3563 
3564 	/* trade a vfsmount reference for active sb one */
3565 	s = path.mnt->mnt_sb;
3566 	atomic_inc(&s->s_active);
3567 	mntput(path.mnt);
3568 	/* lock the sucker */
3569 	down_write(&s->s_umount);
3570 	/* ... and return the root of (sub)tree on it */
3571 	return path.dentry;
3572 }
3573 EXPORT_SYMBOL(mount_subtree);
3574 
3575 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
3576 		char __user *, type, unsigned long, flags, void __user *, data)
3577 {
3578 	int ret;
3579 	char *kernel_type;
3580 	char *kernel_dev;
3581 	void *options;
3582 
3583 	kernel_type = copy_mount_string(type);
3584 	ret = PTR_ERR(kernel_type);
3585 	if (IS_ERR(kernel_type))
3586 		goto out_type;
3587 
3588 	kernel_dev = copy_mount_string(dev_name);
3589 	ret = PTR_ERR(kernel_dev);
3590 	if (IS_ERR(kernel_dev))
3591 		goto out_dev;
3592 
3593 	options = copy_mount_options(data);
3594 	ret = PTR_ERR(options);
3595 	if (IS_ERR(options))
3596 		goto out_data;
3597 
3598 	ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
3599 
3600 	kfree(options);
3601 out_data:
3602 	kfree(kernel_dev);
3603 out_dev:
3604 	kfree(kernel_type);
3605 out_type:
3606 	return ret;
3607 }
3608 
3609 #define FSMOUNT_VALID_FLAGS                                                    \
3610 	(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |            \
3611 	 MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME |       \
3612 	 MOUNT_ATTR_NOSYMFOLLOW)
3613 
3614 #define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)
3615 
3616 #define MOUNT_SETATTR_PROPAGATION_FLAGS \
3617 	(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
3618 
3619 static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
3620 {
3621 	unsigned int mnt_flags = 0;
3622 
3623 	if (attr_flags & MOUNT_ATTR_RDONLY)
3624 		mnt_flags |= MNT_READONLY;
3625 	if (attr_flags & MOUNT_ATTR_NOSUID)
3626 		mnt_flags |= MNT_NOSUID;
3627 	if (attr_flags & MOUNT_ATTR_NODEV)
3628 		mnt_flags |= MNT_NODEV;
3629 	if (attr_flags & MOUNT_ATTR_NOEXEC)
3630 		mnt_flags |= MNT_NOEXEC;
3631 	if (attr_flags & MOUNT_ATTR_NODIRATIME)
3632 		mnt_flags |= MNT_NODIRATIME;
3633 	if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW)
3634 		mnt_flags |= MNT_NOSYMFOLLOW;
3635 
3636 	return mnt_flags;
3637 }
3638 
3639 /*
3640  * Create a kernel mount representation for a new, prepared superblock
3641  * (specified by fs_fd) and attach to an open_tree-like file descriptor.
3642  */
3643 SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
3644 		unsigned int, attr_flags)
3645 {
3646 	struct mnt_namespace *ns;
3647 	struct fs_context *fc;
3648 	struct file *file;
3649 	struct path newmount;
3650 	struct mount *mnt;
3651 	struct fd f;
3652 	unsigned int mnt_flags = 0;
3653 	long ret;
3654 
3655 	if (!may_mount())
3656 		return -EPERM;
3657 
3658 	if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
3659 		return -EINVAL;
3660 
3661 	if (attr_flags & ~FSMOUNT_VALID_FLAGS)
3662 		return -EINVAL;
3663 
3664 	mnt_flags = attr_flags_to_mnt_flags(attr_flags);
3665 
3666 	switch (attr_flags & MOUNT_ATTR__ATIME) {
3667 	case MOUNT_ATTR_STRICTATIME:
3668 		break;
3669 	case MOUNT_ATTR_NOATIME:
3670 		mnt_flags |= MNT_NOATIME;
3671 		break;
3672 	case MOUNT_ATTR_RELATIME:
3673 		mnt_flags |= MNT_RELATIME;
3674 		break;
3675 	default:
3676 		return -EINVAL;
3677 	}
3678 
3679 	f = fdget(fs_fd);
3680 	if (!f.file)
3681 		return -EBADF;
3682 
3683 	ret = -EINVAL;
3684 	if (f.file->f_op != &fscontext_fops)
3685 		goto err_fsfd;
3686 
3687 	fc = f.file->private_data;
3688 
3689 	ret = mutex_lock_interruptible(&fc->uapi_mutex);
3690 	if (ret < 0)
3691 		goto err_fsfd;
3692 
3693 	/* There must be a valid superblock or we can't mount it */
3694 	ret = -EINVAL;
3695 	if (!fc->root)
3696 		goto err_unlock;
3697 
3698 	ret = -EPERM;
3699 	if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
3700 		pr_warn("VFS: Mount too revealing\n");
3701 		goto err_unlock;
3702 	}
3703 
3704 	ret = -EBUSY;
3705 	if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
3706 		goto err_unlock;
3707 
3708 	if (fc->sb_flags & SB_MANDLOCK)
3709 		warn_mandlock();
3710 
3711 	newmount.mnt = vfs_create_mount(fc);
3712 	if (IS_ERR(newmount.mnt)) {
3713 		ret = PTR_ERR(newmount.mnt);
3714 		goto err_unlock;
3715 	}
3716 	newmount.dentry = dget(fc->root);
3717 	newmount.mnt->mnt_flags = mnt_flags;
3718 
3719 	/* We've done the mount bit - now move the file context into more or
3720 	 * less the same state as if we'd done an fspick().  We don't want to
3721 	 * do any memory allocation or anything like that at this point as we
3722 	 * don't want to have to handle any errors incurred.
3723 	 */
3724 	vfs_clean_context(fc);
3725 
3726 	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
3727 	if (IS_ERR(ns)) {
3728 		ret = PTR_ERR(ns);
3729 		goto err_path;
3730 	}
3731 	mnt = real_mount(newmount.mnt);
3732 	mnt->mnt_ns = ns;
3733 	ns->root = mnt;
3734 	ns->mounts = 1;
3735 	list_add(&mnt->mnt_list, &ns->list);
3736 	mntget(newmount.mnt);
3737 
3738 	/* Attach to an apparent O_PATH fd with a note that we need to unmount
3739 	 * it, not simply put it.
3740 	 */
3741 	file = dentry_open(&newmount, O_PATH, fc->cred);
3742 	if (IS_ERR(file)) {
3743 		dissolve_on_fput(newmount.mnt);
3744 		ret = PTR_ERR(file);
3745 		goto err_path;
3746 	}
3747 	file->f_mode |= FMODE_NEED_UNMOUNT;
3748 
3749 	ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
3750 	if (ret >= 0)
3751 		fd_install(ret, file);
3752 	else
3753 		fput(file);
3754 
3755 err_path:
3756 	path_put(&newmount);
3757 err_unlock:
3758 	mutex_unlock(&fc->uapi_mutex);
3759 err_fsfd:
3760 	fdput(f);
3761 	return ret;
3762 }
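
/*
 * Illustrative userspace sketch of the whole new-API sequence fsmount()
 * belongs to (device and mount point names are made up, error handling is
 * omitted, and a libc exposing these wrappers is assumed):
 *
 *	int fsfd = fsopen("ext4", FSOPEN_CLOEXEC);
 *	fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "/dev/sda1", 0);
 *	fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
 *	int mfd = fsmount(fsfd, FSMOUNT_CLOEXEC, MOUNT_ATTR_RDONLY);
 *	move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
 */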
3763 
3764 /*
3765  * Move a mount from one place to another.  In combination with
3766  * fsopen()/fsmount() this is used to install a new mount and in combination
3767  * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
3768  * a mount subtree.
3769  *
3770  * Note the flags value is a combination of MOVE_MOUNT_* flags.
3771  */
3772 SYSCALL_DEFINE5(move_mount,
3773 		int, from_dfd, const char __user *, from_pathname,
3774 		int, to_dfd, const char __user *, to_pathname,
3775 		unsigned int, flags)
3776 {
3777 	struct path from_path, to_path;
3778 	unsigned int lflags;
3779 	int ret = 0;
3780 
3781 	if (!may_mount())
3782 		return -EPERM;
3783 
3784 	if (flags & ~MOVE_MOUNT__MASK)
3785 		return -EINVAL;
3786 
3787 	/* If someone gives a pathname, they aren't permitted to move
3788 	 * from an fd that requires unmount as we can't get at the flag
3789 	 * to clear it afterwards.
3790 	 */
3791 	lflags = 0;
3792 	if (flags & MOVE_MOUNT_F_SYMLINKS)	lflags |= LOOKUP_FOLLOW;
3793 	if (flags & MOVE_MOUNT_F_AUTOMOUNTS)	lflags |= LOOKUP_AUTOMOUNT;
3794 	if (flags & MOVE_MOUNT_F_EMPTY_PATH)	lflags |= LOOKUP_EMPTY;
3795 
3796 	ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
3797 	if (ret < 0)
3798 		return ret;
3799 
3800 	lflags = 0;
3801 	if (flags & MOVE_MOUNT_T_SYMLINKS)	lflags |= LOOKUP_FOLLOW;
3802 	if (flags & MOVE_MOUNT_T_AUTOMOUNTS)	lflags |= LOOKUP_AUTOMOUNT;
3803 	if (flags & MOVE_MOUNT_T_EMPTY_PATH)	lflags |= LOOKUP_EMPTY;
3804 
3805 	ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
3806 	if (ret < 0)
3807 		goto out_from;
3808 
3809 	ret = security_move_mount(&from_path, &to_path);
3810 	if (ret < 0)
3811 		goto out_to;
3812 
3813 	if (flags & MOVE_MOUNT_SET_GROUP)
3814 		ret = do_set_group(&from_path, &to_path);
3815 	else
3816 		ret = do_move_mount(&from_path, &to_path);
3817 
3818 out_to:
3819 	path_put(&to_path);
3820 out_from:
3821 	path_put(&from_path);
3822 	return ret;
3823 }
3824 
3825 /*
3826  * Return true if path is reachable from root
3827  *
3828  * namespace_sem or mount_lock is held
3829  */
3830 bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
3831 			 const struct path *root)
3832 {
3833 	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
3834 		dentry = mnt->mnt_mountpoint;
3835 		mnt = mnt->mnt_parent;
3836 	}
3837 	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
3838 }
3839 
3840 bool path_is_under(const struct path *path1, const struct path *path2)
3841 {
3842 	bool res;
3843 	read_seqlock_excl(&mount_lock);
3844 	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
3845 	read_sequnlock_excl(&mount_lock);
3846 	return res;
3847 }
3848 EXPORT_SYMBOL(path_is_under);
3849 
3850 /*
3851  * pivot_root Semantics:
3852  * Moves the root file system of the current process to the directory put_old,
3853  * makes new_root as the new root file system of the current process, and sets
3854  * root/cwd of all processes which had them on the current root to new_root.
3855  *
3856  * Restrictions:
3857  * The new_root and put_old must be directories, and must not be on the
3858  * same file system as the current process root. The put_old must be
3859  * underneath new_root, i.e. adding a non-zero number of /.. to the string
3860  * pointed to by put_old must yield the same directory as new_root. No other
3861  * file system may be mounted on put_old. After all, new_root is a mountpoint.
3862  *
3863  * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
3864  * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
3865  * in this situation.
3866  *
3867  * Notes:
3868  *  - we don't move root/cwd if they are not at the root (reason: if something
3869  *    cared enough to change them, it's probably wrong to force them elsewhere)
3870  *  - it's okay to pick a root that isn't the root of a file system, e.g.
3871  *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
3872  *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
3873  *    first.
3874  */
3875 SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
3876 		const char __user *, put_old)
3877 {
3878 	struct path new, old, root;
3879 	struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
3880 	struct mountpoint *old_mp, *root_mp;
3881 	int error;
3882 
3883 	if (!may_mount())
3884 		return -EPERM;
3885 
3886 	error = user_path_at(AT_FDCWD, new_root,
3887 			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
3888 	if (error)
3889 		goto out0;
3890 
3891 	error = user_path_at(AT_FDCWD, put_old,
3892 			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
3893 	if (error)
3894 		goto out1;
3895 
3896 	error = security_sb_pivotroot(&old, &new);
3897 	if (error)
3898 		goto out2;
3899 
3900 	get_fs_root(current->fs, &root);
3901 	old_mp = lock_mount(&old);
3902 	error = PTR_ERR(old_mp);
3903 	if (IS_ERR(old_mp))
3904 		goto out3;
3905 
3906 	error = -EINVAL;
3907 	new_mnt = real_mount(new.mnt);
3908 	root_mnt = real_mount(root.mnt);
3909 	old_mnt = real_mount(old.mnt);
3910 	ex_parent = new_mnt->mnt_parent;
3911 	root_parent = root_mnt->mnt_parent;
3912 	if (IS_MNT_SHARED(old_mnt) ||
3913 		IS_MNT_SHARED(ex_parent) ||
3914 		IS_MNT_SHARED(root_parent))
3915 		goto out4;
3916 	if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
3917 		goto out4;
3918 	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
3919 		goto out4;
3920 	error = -ENOENT;
3921 	if (d_unlinked(new.dentry))
3922 		goto out4;
3923 	error = -EBUSY;
3924 	if (new_mnt == root_mnt || old_mnt == root_mnt)
3925 		goto out4; /* loop, on the same file system  */
3926 	error = -EINVAL;
3927 	if (root.mnt->mnt_root != root.dentry)
3928 		goto out4; /* not a mountpoint */
3929 	if (!mnt_has_parent(root_mnt))
3930 		goto out4; /* not attached */
3931 	if (new.mnt->mnt_root != new.dentry)
3932 		goto out4; /* not a mountpoint */
3933 	if (!mnt_has_parent(new_mnt))
3934 		goto out4; /* not attached */
3935 	/* make sure we can reach put_old from new_root */
3936 	if (!is_path_reachable(old_mnt, old.dentry, &new))
3937 		goto out4;
3938 	/* make certain new is below the root */
3939 	if (!is_path_reachable(new_mnt, new.dentry, &root))
3940 		goto out4;
3941 	lock_mount_hash();
3942 	umount_mnt(new_mnt);
3943 	root_mp = unhash_mnt(root_mnt);  /* we'll need its mountpoint */
3944 	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
3945 		new_mnt->mnt.mnt_flags |= MNT_LOCKED;
3946 		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
3947 	}
3948 	/* mount old root on put_old */
3949 	attach_mnt(root_mnt, old_mnt, old_mp);
3950 	/* mount new_root on / */
3951 	attach_mnt(new_mnt, root_parent, root_mp);
3952 	mnt_add_count(root_parent, -1);
3953 	touch_mnt_namespace(current->nsproxy->mnt_ns);
3954 	/* A moved mount should not expire automatically */
3955 	list_del_init(&new_mnt->mnt_expire);
3956 	put_mountpoint(root_mp);
3957 	unlock_mount_hash();
3958 	chroot_fs_refs(&root, &new);
3959 	error = 0;
3960 out4:
3961 	unlock_mount(old_mp);
3962 	if (!error)
3963 		mntput_no_expire(ex_parent);
3964 out3:
3965 	path_put(&root);
3966 out2:
3967 	path_put(&old);
3968 out1:
3969 	path_put(&new);
3970 out0:
3971 	return error;
3972 }
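
/*
 * Hedged usage sketch (userspace, illustrative paths): a container runtime or
 * initramfs typically bind-mounts the new root onto itself so that it is a
 * mount point, changes into it, and then stacks the old root onto the same
 * directory so it can be detached afterwards:
 *
 *	mount("/new_root", "/new_root", NULL, MS_BIND | MS_REC, NULL);
 *	chdir("/new_root");
 *	syscall(SYS_pivot_root, ".", ".");
 *	umount2(".", MNT_DETACH);
 *
 * This relies on the put_old-underneath-new_root rule described above;
 * "/new_root" and the SYS_pivot_root wrapper are assumptions.
 */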
3973 
3974 static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
3975 {
3976 	unsigned int flags = mnt->mnt.mnt_flags;
3977 
3978 	/*  flags to clear */
3979 	flags &= ~kattr->attr_clr;
3980 	/* flags to raise */
3981 	flags |= kattr->attr_set;
3982 
3983 	return flags;
3984 }
3985 
3986 static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
3987 {
3988 	struct vfsmount *m = &mnt->mnt;
3989 	struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
3990 
3991 	if (!kattr->mnt_idmap)
3992 		return 0;
3993 
3994 	/*
3995 	 * Creating an idmapped mount with the filesystem-wide idmapping
3996 	 * doesn't make sense, so block that. We don't allow mushy semantics.
3997 	 */
3998 	if (!check_fsmapping(kattr->mnt_idmap, m->mnt_sb))
3999 		return -EINVAL;
4000 
4001 	/*
4002 	 * Once a mount has been idmapped we don't allow it to change its
4003 	 * mapping. It makes things simpler and callers can just create
4004 	 * another bind-mount they can idmap if they want to.
4005 	 */
4006 	if (is_idmapped_mnt(m))
4007 		return -EPERM;
4008 
4009 	/* The underlying filesystem doesn't support idmapped mounts yet. */
4010 	if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
4011 		return -EINVAL;
4012 
4013 	/* We're not controlling the superblock. */
4014 	if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
4015 		return -EPERM;
4016 
4017 	/* Mount has already been visible in the filesystem hierarchy. */
4018 	if (!is_anon_ns(mnt->mnt_ns))
4019 		return -EINVAL;
4020 
4021 	return 0;
4022 }
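
/*
 * Hedged userspace sketch of the flow these checks gate (the source path,
 * target path and userns_fd are illustrative assumptions): the mount must be
 * a detached OPEN_TREE_CLONE copy, it is idmapped exactly once via
 * mount_setattr(), and only then attached:
 *
 *	int fd_tree = open_tree(AT_FDCWD, "/some/dir",
 *				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
 *	struct mount_attr attr = {
 *		.attr_set  = MOUNT_ATTR_IDMAP,
 *		.userns_fd = userns_fd,
 *	};
 *	mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr));
 *	move_mount(fd_tree, "", AT_FDCWD, "/target", MOVE_MOUNT_F_EMPTY_PATH);
 */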
4023 
4024 /**
4025  * mnt_allow_writers() - check whether the attribute change allows writers
4026  * @kattr: the new mount attributes
4027  * @mnt: the mount to which @kattr will be applied
4028  *
4029  * Check whether the new mount attributes in @kattr allow concurrent writers.
4030  *
4031  * Return: true if writers are allowed, false if they need to be held off
4032  */
4033 static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
4034 				     const struct mount *mnt)
4035 {
4036 	return (!(kattr->attr_set & MNT_READONLY) ||
4037 		(mnt->mnt.mnt_flags & MNT_READONLY)) &&
4038 	       !kattr->mnt_idmap;
4039 }
4040 
4041 static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
4042 {
4043 	struct mount *m;
4044 	int err;
4045 
4046 	for (m = mnt; m; m = next_mnt(m, mnt)) {
4047 		if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
4048 			err = -EPERM;
4049 			break;
4050 		}
4051 
4052 		err = can_idmap_mount(kattr, m);
4053 		if (err)
4054 			break;
4055 
4056 		if (!mnt_allow_writers(kattr, m)) {
4057 			err = mnt_hold_writers(m);
4058 			if (err)
4059 				break;
4060 		}
4061 
4062 		if (!kattr->recurse)
4063 			return 0;
4064 	}
4065 
4066 	if (err) {
4067 		struct mount *p;
4068 
4069 		/*
4070 		 * If we had to call mnt_hold_writers(), MNT_WRITE_HOLD will
4071 		 * be set in the mount's mnt_flags. The loop unsets MNT_WRITE_HOLD
4072 		 * for all mounts and needs to take care to include the first mount.
4073 		 */
4074 		for (p = mnt; p; p = next_mnt(p, mnt)) {
4075 			/* If we had to hold writers unblock them. */
4076 			if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
4077 				mnt_unhold_writers(p);
4078 
4079 			/*
4080 			 * We're done once the first mount we changed got
4081 			 * MNT_WRITE_HOLD unset.
4082 			 */
4083 			if (p == m)
4084 				break;
4085 		}
4086 	}
4087 	return err;
4088 }
4089 
4090 static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
4091 {
4092 	if (!kattr->mnt_idmap)
4093 		return;
4094 
4095 	/*
4096 	 * Pairs with smp_load_acquire() in mnt_idmap().
4097 	 *
4098 	 * Since we only allow a mount to change the idmapping once and
4099 	 * verified this in can_idmap_mount() we know that the mount has
4100 	 * @nop_mnt_idmap attached to it. So there's no need to drop any
4101 	 * references.
4102 	 */
4103 	smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
4104 }
4105 
4106 static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
4107 {
4108 	struct mount *m;
4109 
4110 	for (m = mnt; m; m = next_mnt(m, mnt)) {
4111 		unsigned int flags;
4112 
4113 		do_idmap_mount(kattr, m);
4114 		flags = recalc_flags(kattr, m);
4115 		WRITE_ONCE(m->mnt.mnt_flags, flags);
4116 
4117 		/* If we had to hold writers unblock them. */
4118 		if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
4119 			mnt_unhold_writers(m);
4120 
4121 		if (kattr->propagation)
4122 			change_mnt_propagation(m, kattr->propagation);
4123 		if (!kattr->recurse)
4124 			break;
4125 	}
4126 	touch_mnt_namespace(mnt->mnt_ns);
4127 }
4128 
4129 static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
4130 {
4131 	struct mount *mnt = real_mount(path->mnt);
4132 	int err = 0;
4133 
4134 	if (path->dentry != mnt->mnt.mnt_root)
4135 		return -EINVAL;
4136 
4137 	if (kattr->mnt_userns) {
4138 		struct mnt_idmap *mnt_idmap;
4139 
4140 		mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
4141 		if (IS_ERR(mnt_idmap))
4142 			return PTR_ERR(mnt_idmap);
4143 		kattr->mnt_idmap = mnt_idmap;
4144 	}
4145 
4146 	if (kattr->propagation) {
4147 		/*
4148 		 * Only take namespace_lock() if we're actually changing
4149 		 * propagation.
4150 		 */
4151 		namespace_lock();
4152 		if (kattr->propagation == MS_SHARED) {
4153 			err = invent_group_ids(mnt, kattr->recurse);
4154 			if (err) {
4155 				namespace_unlock();
4156 				return err;
4157 			}
4158 		}
4159 	}
4160 
4161 	err = -EINVAL;
4162 	lock_mount_hash();
4163 
4164 	/* Ensure that this isn't anything purely vfs internal. */
4165 	if (!is_mounted(&mnt->mnt))
4166 		goto out;
4167 
4168 	/*
4169 	 * If this is an attached mount make sure it's located in the caller's
4170 	 * mount namespace. If it's not, don't let the caller interact with it.
4171 	 * If this is a detached mount make sure it has an anonymous mount
4172 	 * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE.
4173 	 */
4174 	if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns)))
4175 		goto out;
4176 
4177 	/*
4178 	 * First, we get the mount tree in a shape where we can change mount
4179 	 * properties without failure. If that succeeds we commit all changes;
4180 	 * if it fails we clean up.
4181 	 */
4182 	err = mount_setattr_prepare(kattr, mnt);
4183 	if (!err)
4184 		mount_setattr_commit(kattr, mnt);
4185 
4186 out:
4187 	unlock_mount_hash();
4188 
4189 	if (kattr->propagation) {
4190 		if (err)
4191 			cleanup_group_ids(mnt, NULL);
4192 		namespace_unlock();
4193 	}
4194 
4195 	return err;
4196 }
4197 
4198 static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
4199 				struct mount_kattr *kattr, unsigned int flags)
4200 {
4201 	int err = 0;
4202 	struct ns_common *ns;
4203 	struct user_namespace *mnt_userns;
4204 	struct fd f;
4205 
4206 	if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
4207 		return 0;
4208 
4209 	/*
4210 	 * We currently do not support clearing an idmapped mount. If this ever
4211 	 * is a use-case we can revisit this, but for now let's keep it simple
4212 	 * and not allow it.
4213 	 */
4214 	if (attr->attr_clr & MOUNT_ATTR_IDMAP)
4215 		return -EINVAL;
4216 
4217 	if (attr->userns_fd > INT_MAX)
4218 		return -EINVAL;
4219 
4220 	f = fdget(attr->userns_fd);
4221 	if (!f.file)
4222 		return -EBADF;
4223 
4224 	if (!proc_ns_file(f.file)) {
4225 		err = -EINVAL;
4226 		goto out_fput;
4227 	}
4228 
4229 	ns = get_proc_ns(file_inode(f.file));
4230 	if (ns->ops->type != CLONE_NEWUSER) {
4231 		err = -EINVAL;
4232 		goto out_fput;
4233 	}
4234 
4235 	/*
4236 	 * The initial idmapping cannot be used to create an idmapped
4237 	 * mount. We use the initial idmapping as an indicator of a mount
4238 	 * that is not idmapped. It can simply be passed into helpers that
4239 	 * are aware of idmapped mounts as a convenient shortcut. A user
4240 	 * can just create a dedicated identity mapping to achieve the same
4241 	 * result.
4242 	 */
4243 	mnt_userns = container_of(ns, struct user_namespace, ns);
4244 	if (mnt_userns == &init_user_ns) {
4245 		err = -EPERM;
4246 		goto out_fput;
4247 	}
4248 
4249 	/* We're not controlling the target namespace. */
4250 	if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) {
4251 		err = -EPERM;
4252 		goto out_fput;
4253 	}
4254 
4255 	kattr->mnt_userns = get_user_ns(mnt_userns);
4256 
4257 out_fput:
4258 	fdput(f);
4259 	return err;
4260 }
4261 
4262 static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
4263 			     struct mount_kattr *kattr, unsigned int flags)
4264 {
4265 	unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
4266 
4267 	if (flags & AT_NO_AUTOMOUNT)
4268 		lookup_flags &= ~LOOKUP_AUTOMOUNT;
4269 	if (flags & AT_SYMLINK_NOFOLLOW)
4270 		lookup_flags &= ~LOOKUP_FOLLOW;
4271 	if (flags & AT_EMPTY_PATH)
4272 		lookup_flags |= LOOKUP_EMPTY;
4273 
4274 	*kattr = (struct mount_kattr) {
4275 		.lookup_flags	= lookup_flags,
4276 		.recurse	= !!(flags & AT_RECURSIVE),
4277 	};
4278 
4279 	if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
4280 		return -EINVAL;
4281 	if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
4282 		return -EINVAL;
4283 	kattr->propagation = attr->propagation;
4284 
4285 	if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
4286 		return -EINVAL;
4287 
4288 	kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
4289 	kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
4290 
4291 	/*
4292 	 * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
4293 	 * users wanting to transition to a different atime setting cannot
4294 	 * simply specify the atime setting in @attr_set, but must also
4295 	 * specify MOUNT_ATTR__ATIME in the @attr_clr field.
4296 	 * So ensure that MOUNT_ATTR__ATIME can't be partially set in
4297 	 * @attr_clr and that @attr_set can't have any atime bits set if
4298 	 * MOUNT_ATTR__ATIME isn't set in @attr_clr.
4299 	 */
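	/*
	 * For example (illustrative userspace values, not kernel code), a
	 * caller switching a mount to relatime would pass:
	 *
	 *	attr.attr_clr = MOUNT_ATTR__ATIME;
	 *	attr.attr_set = MOUNT_ATTR_RELATIME;
	 */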
4300 	if (attr->attr_clr & MOUNT_ATTR__ATIME) {
4301 		if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
4302 			return -EINVAL;
4303 
4304 		/*
4305 		 * Clear all previous time settings as they are mutually
4306 		 * exclusive.
4307 		 */
4308 		kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;
4309 		switch (attr->attr_set & MOUNT_ATTR__ATIME) {
4310 		case MOUNT_ATTR_RELATIME:
4311 			kattr->attr_set |= MNT_RELATIME;
4312 			break;
4313 		case MOUNT_ATTR_NOATIME:
4314 			kattr->attr_set |= MNT_NOATIME;
4315 			break;
4316 		case MOUNT_ATTR_STRICTATIME:
4317 			break;
4318 		default:
4319 			return -EINVAL;
4320 		}
4321 	} else {
4322 		if (attr->attr_set & MOUNT_ATTR__ATIME)
4323 			return -EINVAL;
4324 	}
4325 
4326 	return build_mount_idmapped(attr, usize, kattr, flags);
4327 }
4328 
4329 static void finish_mount_kattr(struct mount_kattr *kattr)
4330 {
4331 	put_user_ns(kattr->mnt_userns);
4332 	kattr->mnt_userns = NULL;
4333 
4334 	if (kattr->mnt_idmap)
4335 		mnt_idmap_put(kattr->mnt_idmap);
4336 }
4337 
4338 SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
4339 		unsigned int, flags, struct mount_attr __user *, uattr,
4340 		size_t, usize)
4341 {
4342 	int err;
4343 	struct path target;
4344 	struct mount_attr attr;
4345 	struct mount_kattr kattr;
4346 
4347 	BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
4348 
4349 	if (flags & ~(AT_EMPTY_PATH |
4350 		      AT_RECURSIVE |
4351 		      AT_SYMLINK_NOFOLLOW |
4352 		      AT_NO_AUTOMOUNT))
4353 		return -EINVAL;
4354 
4355 	if (unlikely(usize > PAGE_SIZE))
4356 		return -E2BIG;
4357 	if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
4358 		return -EINVAL;
4359 
4360 	if (!may_mount())
4361 		return -EPERM;
4362 
4363 	err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
4364 	if (err)
4365 		return err;
4366 
4367 	/* Don't bother walking through the mounts if this is a nop. */
4368 	if (attr.attr_set == 0 &&
4369 	    attr.attr_clr == 0 &&
4370 	    attr.propagation == 0)
4371 		return 0;
4372 
4373 	err = build_mount_kattr(&attr, usize, &kattr, flags);
4374 	if (err)
4375 		return err;
4376 
4377 	err = user_path_at(dfd, path, kattr.lookup_flags, &target);
4378 	if (!err) {
4379 		err = do_mount_setattr(&target, &kattr);
4380 		path_put(&target);
4381 	}
4382 	finish_mount_kattr(&kattr);
4383 	return err;
4384 }
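
/*
 * Hedged userspace sketch (illustrative path and flags): making an existing
 * mount tree read-only and nosuid recursively with mount_setattr(2). The
 * syscall(2) wrapper is an assumption for libcs without a mount_setattr()
 * stub.
 *
 *	struct mount_attr attr = {
 *		.attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID,
 *	};
 *	syscall(SYS_mount_setattr, AT_FDCWD, "/mnt", AT_RECURSIVE,
 *		&attr, sizeof(attr));
 */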
4385 
4386 static void __init init_mount_tree(void)
4387 {
4388 	struct vfsmount *mnt;
4389 	struct mount *m;
4390 	struct mnt_namespace *ns;
4391 	struct path root;
4392 
4393 	mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
4394 	if (IS_ERR(mnt))
4395 		panic("Can't create rootfs");
4396 
4397 	ns = alloc_mnt_ns(&init_user_ns, false);
4398 	if (IS_ERR(ns))
4399 		panic("Can't allocate initial namespace");
4400 	m = real_mount(mnt);
4401 	m->mnt_ns = ns;
4402 	ns->root = m;
4403 	ns->mounts = 1;
4404 	list_add(&m->mnt_list, &ns->list);
4405 	init_task.nsproxy->mnt_ns = ns;
4406 	get_mnt_ns(ns);
4407 
4408 	root.mnt = mnt;
4409 	root.dentry = mnt->mnt_root;
4410 	mnt->mnt_flags |= MNT_LOCKED;
4411 
4412 	set_fs_pwd(current->fs, &root);
4413 	set_fs_root(current->fs, &root);
4414 }
4415 
4416 void __init mnt_init(void)
4417 {
4418 	int err;
4419 
4420 	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
4421 			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
4422 
4423 	mount_hashtable = alloc_large_system_hash("Mount-cache",
4424 				sizeof(struct hlist_head),
4425 				mhash_entries, 19,
4426 				HASH_ZERO,
4427 				&m_hash_shift, &m_hash_mask, 0, 0);
4428 	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
4429 				sizeof(struct hlist_head),
4430 				mphash_entries, 19,
4431 				HASH_ZERO,
4432 				&mp_hash_shift, &mp_hash_mask, 0, 0);
4433 
4434 	if (!mount_hashtable || !mountpoint_hashtable)
4435 		panic("Failed to allocate mount hash table\n");
4436 
4437 	kernfs_init();
4438 
4439 	err = sysfs_init();
4440 	if (err)
4441 		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
4442 			__func__, err);
4443 	fs_kobj = kobject_create_and_add("fs", NULL);
4444 	if (!fs_kobj)
4445 		printk(KERN_WARNING "%s: kobj create error\n", __func__);
4446 	shmem_init();
4447 	init_rootfs();
4448 	init_mount_tree();
4449 }
4450 
4451 void put_mnt_ns(struct mnt_namespace *ns)
4452 {
4453 	if (!refcount_dec_and_test(&ns->ns.count))
4454 		return;
4455 	drop_collected_mounts(&ns->root->mnt);
4456 	free_mnt_ns(ns);
4457 }
4458 
4459 struct vfsmount *kern_mount(struct file_system_type *type)
4460 {
4461 	struct vfsmount *mnt;
4462 	mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
4463 	if (!IS_ERR(mnt)) {
4464 		/*
4465 		 * It is a long-term mount; don't release mnt until
4466 		 * we unmount it, before the filesystem is unregistered.
4467 		 */
4468 		real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
4469 	}
4470 	return mnt;
4471 }
4472 EXPORT_SYMBOL_GPL(kern_mount);
4473 
4474 void kern_unmount(struct vfsmount *mnt)
4475 {
4476 	/* release long term mount so mount point can be released */
4477 	if (!IS_ERR(mnt)) {
4478 		mnt_make_shortterm(mnt);
4479 		synchronize_rcu();	/* yecchhh... */
4480 		mntput(mnt);
4481 	}
4482 }
4483 EXPORT_SYMBOL(kern_unmount);
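
/*
 * Minimal in-kernel usage sketch (the "examplefs" filesystem type is a
 * hypothetical placeholder, assumed to be registered elsewhere): pair
 * kern_mount() with kern_unmount() so the long-term mount is dropped before
 * unregister_filesystem():
 *
 *	static struct vfsmount *example_mnt;
 *
 *	example_mnt = kern_mount(&examplefs_fs_type);
 *	if (IS_ERR(example_mnt))
 *		return PTR_ERR(example_mnt);
 *	...
 *	kern_unmount(example_mnt);
 */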
4484 
4485 void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
4486 {
4487 	unsigned int i;
4488 
4489 	for (i = 0; i < num; i++)
4490 		mnt_make_shortterm(mnt[i]);
4491 	synchronize_rcu_expedited();
4492 	for (i = 0; i < num; i++)
4493 		mntput(mnt[i]);
4494 }
4495 EXPORT_SYMBOL(kern_unmount_array);
4496 
4497 bool our_mnt(struct vfsmount *mnt)
4498 {
4499 	return check_mnt(real_mount(mnt));
4500 }
4501 
4502 bool current_chrooted(void)
4503 {
4504 	/* Does the current process have a non-standard root? */
4505 	struct path ns_root;
4506 	struct path fs_root;
4507 	bool chrooted;
4508 
4509 	/* Find the namespace root */
4510 	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
4511 	ns_root.dentry = ns_root.mnt->mnt_root;
4512 	path_get(&ns_root);
4513 	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
4514 		;
4515 
4516 	get_fs_root(current->fs, &fs_root);
4517 
4518 	chrooted = !path_equal(&fs_root, &ns_root);
4519 
4520 	path_put(&fs_root);
4521 	path_put(&ns_root);
4522 
4523 	return chrooted;
4524 }
4525 
4526 static bool mnt_already_visible(struct mnt_namespace *ns,
4527 				const struct super_block *sb,
4528 				int *new_mnt_flags)
4529 {
4530 	int new_flags = *new_mnt_flags;
4531 	struct mount *mnt;
4532 	bool visible = false;
4533 
4534 	down_read(&namespace_sem);
4535 	lock_ns_list(ns);
4536 	list_for_each_entry(mnt, &ns->list, mnt_list) {
4537 		struct mount *child;
4538 		int mnt_flags;
4539 
4540 		if (mnt_is_cursor(mnt))
4541 			continue;
4542 
4543 		if (mnt->mnt.mnt_sb->s_type != sb->s_type)
4544 			continue;
4545 
4546 		/* This mount is not fully visible if its root directory
4547 		 * is not the root directory of the filesystem.
4548 		 */
4549 		if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
4550 			continue;
4551 
4552 		/* A local view of the mount flags */
4553 		mnt_flags = mnt->mnt.mnt_flags;
4554 
4555 		/* Don't miss readonly hidden in the superblock flags */
4556 		if (sb_rdonly(mnt->mnt.mnt_sb))
4557 			mnt_flags |= MNT_LOCK_READONLY;
4558 
4559 		/* Verify the mount flags are equal to or more permissive
4560 		 * than the proposed new mount.
4561 		 */
4562 		if ((mnt_flags & MNT_LOCK_READONLY) &&
4563 		    !(new_flags & MNT_READONLY))
4564 			continue;
4565 		if ((mnt_flags & MNT_LOCK_ATIME) &&
4566 		    ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
4567 			continue;
4568 
4569 		/* This mount is not fully visible if there are any
4570 		 * locked child mounts that cover anything except for
4571 		 * empty directories.
4572 		 */
4573 		list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
4574 			struct inode *inode = child->mnt_mountpoint->d_inode;
4575 			/* Only worry about locked mounts */
4576 			if (!(child->mnt.mnt_flags & MNT_LOCKED))
4577 				continue;
4578 			/* Is the directory permanently empty? */
4579 			if (!is_empty_dir_inode(inode))
4580 				goto next;
4581 		}
4582 		/* Preserve the locked attributes */
4583 		*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
4584 					       MNT_LOCK_ATIME);
4585 		visible = true;
4586 		goto found;
4587 	next:	;
4588 	}
4589 found:
4590 	unlock_ns_list(ns);
4591 	up_read(&namespace_sem);
4592 	return visible;
4593 }
4594 
4595 static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
4596 {
4597 	const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
4598 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
4599 	unsigned long s_iflags;
4600 
4601 	if (ns->user_ns == &init_user_ns)
4602 		return false;
4603 
4604 	/* Can this filesystem be too revealing? */
4605 	s_iflags = sb->s_iflags;
4606 	if (!(s_iflags & SB_I_USERNS_VISIBLE))
4607 		return false;
4608 
4609 	if ((s_iflags & required_iflags) != required_iflags) {
4610 		WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
4611 			  required_iflags);
4612 		return true;
4613 	}
4614 
4615 	return !mnt_already_visible(ns, sb, new_mnt_flags);
4616 }
4617 
4618 bool mnt_may_suid(struct vfsmount *mnt)
4619 {
4620 	/*
4621 	 * Foreign mounts (accessed via fchdir or through /proc
4622 	 * symlinks) are always treated as if they are nosuid.  This
4623 	 * prevents namespaces from trusting potentially unsafe
4624 	 * suid/sgid bits, file caps, or security labels that originate
4625 	 * in other namespaces.
4626 	 */
4627 	return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
4628 	       current_in_userns(mnt->mnt_sb->s_user_ns);
4629 }
4630 
4631 static struct ns_common *mntns_get(struct task_struct *task)
4632 {
4633 	struct ns_common *ns = NULL;
4634 	struct nsproxy *nsproxy;
4635 
4636 	task_lock(task);
4637 	nsproxy = task->nsproxy;
4638 	if (nsproxy) {
4639 		ns = &nsproxy->mnt_ns->ns;
4640 		get_mnt_ns(to_mnt_ns(ns));
4641 	}
4642 	task_unlock(task);
4643 
4644 	return ns;
4645 }
4646 
4647 static void mntns_put(struct ns_common *ns)
4648 {
4649 	put_mnt_ns(to_mnt_ns(ns));
4650 }
4651 
4652 static int mntns_install(struct nsset *nsset, struct ns_common *ns)
4653 {
4654 	struct nsproxy *nsproxy = nsset->nsproxy;
4655 	struct fs_struct *fs = nsset->fs;
4656 	struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
4657 	struct user_namespace *user_ns = nsset->cred->user_ns;
4658 	struct path root;
4659 	int err;
4660 
4661 	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
4662 	    !ns_capable(user_ns, CAP_SYS_CHROOT) ||
4663 	    !ns_capable(user_ns, CAP_SYS_ADMIN))
4664 		return -EPERM;
4665 
4666 	if (is_anon_ns(mnt_ns))
4667 		return -EINVAL;
4668 
4669 	if (fs->users != 1)
4670 		return -EINVAL;
4671 
4672 	get_mnt_ns(mnt_ns);
4673 	old_mnt_ns = nsproxy->mnt_ns;
4674 	nsproxy->mnt_ns = mnt_ns;
4675 
4676 	/* Find the root */
4677 	err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
4678 				"/", LOOKUP_DOWN, &root);
4679 	if (err) {
4680 		/* revert to old namespace */
4681 		nsproxy->mnt_ns = old_mnt_ns;
4682 		put_mnt_ns(mnt_ns);
4683 		return err;
4684 	}
4685 
4686 	put_mnt_ns(old_mnt_ns);
4687 
4688 	/* Update the pwd and root */
4689 	set_fs_pwd(fs, &root);
4690 	set_fs_root(fs, &root);
4691 
4692 	path_put(&root);
4693 	return 0;
4694 }
4695 
4696 static struct user_namespace *mntns_owner(struct ns_common *ns)
4697 {
4698 	return to_mnt_ns(ns)->user_ns;
4699 }
4700 
4701 const struct proc_ns_operations mntns_operations = {
4702 	.name		= "mnt",
4703 	.type		= CLONE_NEWNS,
4704 	.get		= mntns_get,
4705 	.put		= mntns_put,
4706 	.install	= mntns_install,
4707 	.owner		= mntns_owner,
4708 };
4709 
4710 #ifdef CONFIG_SYSCTL
4711 static struct ctl_table fs_namespace_sysctls[] = {
4712 	{
4713 		.procname	= "mount-max",
4714 		.data		= &sysctl_mount_max,
4715 		.maxlen		= sizeof(unsigned int),
4716 		.mode		= 0644,
4717 		.proc_handler	= proc_dointvec_minmax,
4718 		.extra1		= SYSCTL_ONE,
4719 	},
4720 	{ }
4721 };
4722 
4723 static int __init init_fs_namespace_sysctls(void)
4724 {
4725 	register_sysctl_init("fs", fs_namespace_sysctls);
4726 	return 0;
4727 }
4728 fs_initcall(init_fs_namespace_sysctls);
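
/*
 * Tuning sketch (illustrative value): the table above exposes the
 * per-namespace mount limit as /proc/sys/fs/mount-max, so it can be raised
 * at runtime with e.g.:
 *
 *	sysctl -w fs.mount-max=200000
 */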
4729 
4730 #endif /* CONFIG_SYSCTL */
4731