xref: /openbmc/linux/fs/exec.c (revision 5f2fb52fac15a8a8e10ce020dd532504a8abfc4e)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *  linux/fs/exec.c
4   *
5   *  Copyright (C) 1991, 1992  Linus Torvalds
6   */
7  
8  /*
9   * #!-checking implemented by tytso.
10   */
11  /*
12   * Demand-loading implemented 01.12.91 - no need to read anything but
13   * the header into memory. The inode of the executable is put into
14   * "current->executable", and page faults do the actual loading. Clean.
15   *
16   * Once more I can proudly say that linux stood up to being changed: it
17   * was less than 2 hours work to get demand-loading completely implemented.
18   *
19   * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
20   * current->executable is only used by the procfs.  This allows a dispatch
21   * table to check for several different types  of binary formats.  We keep
22   * trying until we recognize the file or we run out of supported binary
23   * formats.
24   */
25  
26  #include <linux/slab.h>
27  #include <linux/file.h>
28  #include <linux/fdtable.h>
29  #include <linux/mm.h>
30  #include <linux/vmacache.h>
31  #include <linux/stat.h>
32  #include <linux/fcntl.h>
33  #include <linux/swap.h>
34  #include <linux/string.h>
35  #include <linux/init.h>
36  #include <linux/sched/mm.h>
37  #include <linux/sched/coredump.h>
38  #include <linux/sched/signal.h>
39  #include <linux/sched/numa_balancing.h>
40  #include <linux/sched/task.h>
41  #include <linux/pagemap.h>
42  #include <linux/perf_event.h>
43  #include <linux/highmem.h>
44  #include <linux/spinlock.h>
45  #include <linux/key.h>
46  #include <linux/personality.h>
47  #include <linux/binfmts.h>
48  #include <linux/utsname.h>
49  #include <linux/pid_namespace.h>
50  #include <linux/module.h>
51  #include <linux/namei.h>
52  #include <linux/mount.h>
53  #include <linux/security.h>
54  #include <linux/syscalls.h>
55  #include <linux/tsacct_kern.h>
56  #include <linux/cn_proc.h>
57  #include <linux/audit.h>
58  #include <linux/tracehook.h>
59  #include <linux/kmod.h>
60  #include <linux/fsnotify.h>
61  #include <linux/fs_struct.h>
62  #include <linux/oom.h>
63  #include <linux/compat.h>
64  #include <linux/vmalloc.h>
65  
66  #include <linux/uaccess.h>
67  #include <asm/mmu_context.h>
68  #include <asm/tlb.h>
69  
70  #include <trace/events/task.h>
71  #include "internal.h"
72  
73  #include <trace/events/sched.h>
74  
75  int suid_dumpable = 0;
76  
77  static LIST_HEAD(formats);
78  static DEFINE_RWLOCK(binfmt_lock);
79  
80  void __register_binfmt(struct linux_binfmt * fmt, int insert)
81  {
82  	BUG_ON(!fmt);
83  	if (WARN_ON(!fmt->load_binary))
84  		return;
85  	write_lock(&binfmt_lock);
86  	insert ? list_add(&fmt->lh, &formats) :
87  		 list_add_tail(&fmt->lh, &formats);
88  	write_unlock(&binfmt_lock);
89  }
90  
91  EXPORT_SYMBOL(__register_binfmt);
92  
93  void unregister_binfmt(struct linux_binfmt * fmt)
94  {
95  	write_lock(&binfmt_lock);
96  	list_del(&fmt->lh);
97  	write_unlock(&binfmt_lock);
98  }
99  
100  EXPORT_SYMBOL(unregister_binfmt);
101  
102  static inline void put_binfmt(struct linux_binfmt * fmt)
103  {
104  	module_put(fmt->module);
105  }
106  
107  bool path_noexec(const struct path *path)
108  {
109  	return (path->mnt->mnt_flags & MNT_NOEXEC) ||
110  	       (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
111  }
112  
113  #ifdef CONFIG_USELIB
114  /*
115   * Note that a shared library must be both readable and executable due to
116   * security reasons.
117   *
118   * Also note that we take the address to load from from the file itself.
119   */
120  SYSCALL_DEFINE1(uselib, const char __user *, library)
121  {
122  	struct linux_binfmt *fmt;
123  	struct file *file;
124  	struct filename *tmp = getname(library);
125  	int error = PTR_ERR(tmp);
126  	static const struct open_flags uselib_flags = {
127  		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
128  		.acc_mode = MAY_READ | MAY_EXEC,
129  		.intent = LOOKUP_OPEN,
130  		.lookup_flags = LOOKUP_FOLLOW,
131  	};
132  
133  	if (IS_ERR(tmp))
134  		goto out;
135  
136  	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
137  	putname(tmp);
138  	error = PTR_ERR(file);
139  	if (IS_ERR(file))
140  		goto out;
141  
142  	error = -EINVAL;
143  	if (!S_ISREG(file_inode(file)->i_mode))
144  		goto exit;
145  
146  	error = -EACCES;
147  	if (path_noexec(&file->f_path))
148  		goto exit;
149  
150  	fsnotify_open(file);
151  
152  	error = -ENOEXEC;
153  
154  	read_lock(&binfmt_lock);
155  	list_for_each_entry(fmt, &formats, lh) {
156  		if (!fmt->load_shlib)
157  			continue;
158  		if (!try_module_get(fmt->module))
159  			continue;
160  		read_unlock(&binfmt_lock);
161  		error = fmt->load_shlib(file);
162  		read_lock(&binfmt_lock);
163  		put_binfmt(fmt);
164  		if (error != -ENOEXEC)
165  			break;
166  	}
167  	read_unlock(&binfmt_lock);
168  exit:
169  	fput(file);
170  out:
171    	return error;
172  }
173  #endif /* #ifdef CONFIG_USELIB */
174  
175  #ifdef CONFIG_MMU
176  /*
177   * The nascent bprm->mm is not visible until exec_mmap() but it can
178   * use a lot of memory, account these pages in current->mm temporary
179   * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
180   * change the counter back via acct_arg_size(0).
181   */
182  static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
183  {
184  	struct mm_struct *mm = current->mm;
185  	long diff = (long)(pages - bprm->vma_pages);
186  
187  	if (!mm || !diff)
188  		return;
189  
190  	bprm->vma_pages = pages;
191  	add_mm_counter(mm, MM_ANONPAGES, diff);
192  }
193  
194  static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
195  		int write)
196  {
197  	struct page *page;
198  	int ret;
199  	unsigned int gup_flags = FOLL_FORCE;
200  
201  #ifdef CONFIG_STACK_GROWSUP
202  	if (write) {
203  		ret = expand_downwards(bprm->vma, pos);
204  		if (ret < 0)
205  			return NULL;
206  	}
207  #endif
208  
209  	if (write)
210  		gup_flags |= FOLL_WRITE;
211  
212  	/*
213  	 * We are doing an exec().  'current' is the process
214  	 * doing the exec and bprm->mm is the new process's mm.
215  	 */
216  	ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
217  			&page, NULL, NULL);
218  	if (ret <= 0)
219  		return NULL;
220  
221  	if (write)
222  		acct_arg_size(bprm, vma_pages(bprm->vma));
223  
224  	return page;
225  }
226  
227  static void put_arg_page(struct page *page)
228  {
229  	put_page(page);
230  }
231  
232  static void free_arg_pages(struct linux_binprm *bprm)
233  {
234  }
235  
236  static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
237  		struct page *page)
238  {
239  	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
240  }
241  
242  static int __bprm_mm_init(struct linux_binprm *bprm)
243  {
244  	int err;
245  	struct vm_area_struct *vma = NULL;
246  	struct mm_struct *mm = bprm->mm;
247  
248  	bprm->vma = vma = vm_area_alloc(mm);
249  	if (!vma)
250  		return -ENOMEM;
251  	vma_set_anonymous(vma);
252  
253  	if (down_write_killable(&mm->mmap_sem)) {
254  		err = -EINTR;
255  		goto err_free;
256  	}
257  
258  	/*
259  	 * Place the stack at the largest stack address the architecture
260  	 * supports. Later, we'll move this to an appropriate place. We don't
261  	 * use STACK_TOP because that can depend on attributes which aren't
262  	 * configured yet.
263  	 */
264  	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
265  	vma->vm_end = STACK_TOP_MAX;
266  	vma->vm_start = vma->vm_end - PAGE_SIZE;
267  	vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
268  	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
269  
270  	err = insert_vm_struct(mm, vma);
271  	if (err)
272  		goto err;
273  
274  	mm->stack_vm = mm->total_vm = 1;
275  	up_write(&mm->mmap_sem);
276  	bprm->p = vma->vm_end - sizeof(void *);
277  	return 0;
278  err:
279  	up_write(&mm->mmap_sem);
280  err_free:
281  	bprm->vma = NULL;
282  	vm_area_free(vma);
283  	return err;
284  }
285  
286  static bool valid_arg_len(struct linux_binprm *bprm, long len)
287  {
288  	return len <= MAX_ARG_STRLEN;
289  }
290  
291  #else
292  
293  static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
294  {
295  }
296  
297  static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
298  		int write)
299  {
300  	struct page *page;
301  
302  	page = bprm->page[pos / PAGE_SIZE];
303  	if (!page && write) {
304  		page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
305  		if (!page)
306  			return NULL;
307  		bprm->page[pos / PAGE_SIZE] = page;
308  	}
309  
310  	return page;
311  }
312  
313  static void put_arg_page(struct page *page)
314  {
315  }
316  
317  static void free_arg_page(struct linux_binprm *bprm, int i)
318  {
319  	if (bprm->page[i]) {
320  		__free_page(bprm->page[i]);
321  		bprm->page[i] = NULL;
322  	}
323  }
324  
325  static void free_arg_pages(struct linux_binprm *bprm)
326  {
327  	int i;
328  
329  	for (i = 0; i < MAX_ARG_PAGES; i++)
330  		free_arg_page(bprm, i);
331  }
332  
333  static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
334  		struct page *page)
335  {
336  }
337  
338  static int __bprm_mm_init(struct linux_binprm *bprm)
339  {
340  	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
341  	return 0;
342  }
343  
344  static bool valid_arg_len(struct linux_binprm *bprm, long len)
345  {
346  	return len <= bprm->p;
347  }
348  
349  #endif /* CONFIG_MMU */
350  
351  /*
352   * Create a new mm_struct and populate it with a temporary stack
353   * vm_area_struct.  We don't have enough context at this point to set the stack
354   * flags, permissions, and offset, so we use temporary values.  We'll update
355   * them later in setup_arg_pages().
356   */
357  static int bprm_mm_init(struct linux_binprm *bprm)
358  {
359  	int err;
360  	struct mm_struct *mm = NULL;
361  
362  	bprm->mm = mm = mm_alloc();
363  	err = -ENOMEM;
364  	if (!mm)
365  		goto err;
366  
367  	/* Save current stack limit for all calculations made during exec. */
368  	task_lock(current->group_leader);
369  	bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
370  	task_unlock(current->group_leader);
371  
372  	err = __bprm_mm_init(bprm);
373  	if (err)
374  		goto err;
375  
376  	return 0;
377  
378  err:
379  	if (mm) {
380  		bprm->mm = NULL;
381  		mmdrop(mm);
382  	}
383  
384  	return err;
385  }
386  
387  struct user_arg_ptr {
388  #ifdef CONFIG_COMPAT
389  	bool is_compat;
390  #endif
391  	union {
392  		const char __user *const __user *native;
393  #ifdef CONFIG_COMPAT
394  		const compat_uptr_t __user *compat;
395  #endif
396  	} ptr;
397  };
398  
399  static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
400  {
401  	const char __user *native;
402  
403  #ifdef CONFIG_COMPAT
404  	if (unlikely(argv.is_compat)) {
405  		compat_uptr_t compat;
406  
407  		if (get_user(compat, argv.ptr.compat + nr))
408  			return ERR_PTR(-EFAULT);
409  
410  		return compat_ptr(compat);
411  	}
412  #endif
413  
414  	if (get_user(native, argv.ptr.native + nr))
415  		return ERR_PTR(-EFAULT);
416  
417  	return native;
418  }
419  
420  /*
421   * count() counts the number of strings in array ARGV.
422   */
423  static int count(struct user_arg_ptr argv, int max)
424  {
425  	int i = 0;
426  
427  	if (argv.ptr.native != NULL) {
428  		for (;;) {
429  			const char __user *p = get_user_arg_ptr(argv, i);
430  
431  			if (!p)
432  				break;
433  
434  			if (IS_ERR(p))
435  				return -EFAULT;
436  
437  			if (i >= max)
438  				return -E2BIG;
439  			++i;
440  
441  			if (fatal_signal_pending(current))
442  				return -ERESTARTNOHAND;
443  			cond_resched();
444  		}
445  	}
446  	return i;
447  }
448  
449  static int prepare_arg_pages(struct linux_binprm *bprm,
450  			struct user_arg_ptr argv, struct user_arg_ptr envp)
451  {
452  	unsigned long limit, ptr_size;
453  
454  	bprm->argc = count(argv, MAX_ARG_STRINGS);
455  	if (bprm->argc < 0)
456  		return bprm->argc;
457  
458  	bprm->envc = count(envp, MAX_ARG_STRINGS);
459  	if (bprm->envc < 0)
460  		return bprm->envc;
461  
462  	/*
463  	 * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
464  	 * (whichever is smaller) for the argv+env strings.
465  	 * This ensures that:
466  	 *  - the remaining binfmt code will not run out of stack space,
467  	 *  - the program will have a reasonable amount of stack left
468  	 *    to work from.
469  	 */
470  	limit = _STK_LIM / 4 * 3;
471  	limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
472  	/*
473  	 * We've historically supported up to 32 pages (ARG_MAX)
474  	 * of argument strings even with small stacks
475  	 */
476  	limit = max_t(unsigned long, limit, ARG_MAX);
477  	/*
478  	 * We must account for the size of all the argv and envp pointers to
479  	 * the argv and envp strings, since they will also take up space in
480  	 * the stack. They aren't stored until much later when we can't
481  	 * signal to the parent that the child has run out of stack space.
482  	 * Instead, calculate it here so it's possible to fail gracefully.
483  	 */
484  	ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
485  	if (limit <= ptr_size)
486  		return -E2BIG;
487  	limit -= ptr_size;
488  
489  	bprm->argmin = bprm->p - limit;
490  	return 0;
491  }
492  
493  /*
494   * 'copy_strings()' copies argument/environment strings from the old
495   * processes's memory to the new process's stack.  The call to get_user_pages()
496   * ensures the destination page is created and not swapped out.
497   */
498  static int copy_strings(int argc, struct user_arg_ptr argv,
499  			struct linux_binprm *bprm)
500  {
501  	struct page *kmapped_page = NULL;
502  	char *kaddr = NULL;
503  	unsigned long kpos = 0;
504  	int ret;
505  
506  	while (argc-- > 0) {
507  		const char __user *str;
508  		int len;
509  		unsigned long pos;
510  
511  		ret = -EFAULT;
512  		str = get_user_arg_ptr(argv, argc);
513  		if (IS_ERR(str))
514  			goto out;
515  
516  		len = strnlen_user(str, MAX_ARG_STRLEN);
517  		if (!len)
518  			goto out;
519  
520  		ret = -E2BIG;
521  		if (!valid_arg_len(bprm, len))
522  			goto out;
523  
524  		/* We're going to work our way backwords. */
525  		pos = bprm->p;
526  		str += len;
527  		bprm->p -= len;
528  #ifdef CONFIG_MMU
529  		if (bprm->p < bprm->argmin)
530  			goto out;
531  #endif
532  
533  		while (len > 0) {
534  			int offset, bytes_to_copy;
535  
536  			if (fatal_signal_pending(current)) {
537  				ret = -ERESTARTNOHAND;
538  				goto out;
539  			}
540  			cond_resched();
541  
542  			offset = pos % PAGE_SIZE;
543  			if (offset == 0)
544  				offset = PAGE_SIZE;
545  
546  			bytes_to_copy = offset;
547  			if (bytes_to_copy > len)
548  				bytes_to_copy = len;
549  
550  			offset -= bytes_to_copy;
551  			pos -= bytes_to_copy;
552  			str -= bytes_to_copy;
553  			len -= bytes_to_copy;
554  
555  			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
556  				struct page *page;
557  
558  				page = get_arg_page(bprm, pos, 1);
559  				if (!page) {
560  					ret = -E2BIG;
561  					goto out;
562  				}
563  
564  				if (kmapped_page) {
565  					flush_kernel_dcache_page(kmapped_page);
566  					kunmap(kmapped_page);
567  					put_arg_page(kmapped_page);
568  				}
569  				kmapped_page = page;
570  				kaddr = kmap(kmapped_page);
571  				kpos = pos & PAGE_MASK;
572  				flush_arg_page(bprm, kpos, kmapped_page);
573  			}
574  			if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
575  				ret = -EFAULT;
576  				goto out;
577  			}
578  		}
579  	}
580  	ret = 0;
581  out:
582  	if (kmapped_page) {
583  		flush_kernel_dcache_page(kmapped_page);
584  		kunmap(kmapped_page);
585  		put_arg_page(kmapped_page);
586  	}
587  	return ret;
588  }
589  
590  /*
591   * Like copy_strings, but get argv and its values from kernel memory.
592   */
593  int copy_strings_kernel(int argc, const char *const *__argv,
594  			struct linux_binprm *bprm)
595  {
596  	int r;
597  	mm_segment_t oldfs = get_fs();
598  	struct user_arg_ptr argv = {
599  		.ptr.native = (const char __user *const  __user *)__argv,
600  	};
601  
602  	set_fs(KERNEL_DS);
603  	r = copy_strings(argc, argv, bprm);
604  	set_fs(oldfs);
605  
606  	return r;
607  }
608  EXPORT_SYMBOL(copy_strings_kernel);
609  
610  #ifdef CONFIG_MMU
611  
612  /*
613   * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
614   * the binfmt code determines where the new stack should reside, we shift it to
615   * its final location.  The process proceeds as follows:
616   *
617   * 1) Use shift to calculate the new vma endpoints.
618   * 2) Extend vma to cover both the old and new ranges.  This ensures the
619   *    arguments passed to subsequent functions are consistent.
620   * 3) Move vma's page tables to the new range.
621   * 4) Free up any cleared pgd range.
622   * 5) Shrink the vma to cover only the new range.
623   */
624  static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
625  {
626  	struct mm_struct *mm = vma->vm_mm;
627  	unsigned long old_start = vma->vm_start;
628  	unsigned long old_end = vma->vm_end;
629  	unsigned long length = old_end - old_start;
630  	unsigned long new_start = old_start - shift;
631  	unsigned long new_end = old_end - shift;
632  	struct mmu_gather tlb;
633  
634  	BUG_ON(new_start > new_end);
635  
636  	/*
637  	 * ensure there are no vmas between where we want to go
638  	 * and where we are
639  	 */
640  	if (vma != find_vma(mm, new_start))
641  		return -EFAULT;
642  
643  	/*
644  	 * cover the whole range: [new_start, old_end)
645  	 */
646  	if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
647  		return -ENOMEM;
648  
649  	/*
650  	 * move the page tables downwards, on failure we rely on
651  	 * process cleanup to remove whatever mess we made.
652  	 */
653  	if (length != move_page_tables(vma, old_start,
654  				       vma, new_start, length, false))
655  		return -ENOMEM;
656  
657  	lru_add_drain();
658  	tlb_gather_mmu(&tlb, mm, old_start, old_end);
659  	if (new_end > old_start) {
660  		/*
661  		 * when the old and new regions overlap clear from new_end.
662  		 */
663  		free_pgd_range(&tlb, new_end, old_end, new_end,
664  			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
665  	} else {
666  		/*
667  		 * otherwise, clean from old_start; this is done to not touch
668  		 * the address space in [new_end, old_start) some architectures
669  		 * have constraints on va-space that make this illegal (IA64) -
670  		 * for the others its just a little faster.
671  		 */
672  		free_pgd_range(&tlb, old_start, old_end, new_end,
673  			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
674  	}
675  	tlb_finish_mmu(&tlb, old_start, old_end);
676  
677  	/*
678  	 * Shrink the vma to just the new range.  Always succeeds.
679  	 */
680  	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
681  
682  	return 0;
683  }
684  
685  /*
686   * Finalizes the stack vm_area_struct. The flags and permissions are updated,
687   * the stack is optionally relocated, and some extra space is added.
688   */
689  int setup_arg_pages(struct linux_binprm *bprm,
690  		    unsigned long stack_top,
691  		    int executable_stack)
692  {
693  	unsigned long ret;
694  	unsigned long stack_shift;
695  	struct mm_struct *mm = current->mm;
696  	struct vm_area_struct *vma = bprm->vma;
697  	struct vm_area_struct *prev = NULL;
698  	unsigned long vm_flags;
699  	unsigned long stack_base;
700  	unsigned long stack_size;
701  	unsigned long stack_expand;
702  	unsigned long rlim_stack;
703  
704  #ifdef CONFIG_STACK_GROWSUP
705  	/* Limit stack size */
706  	stack_base = bprm->rlim_stack.rlim_max;
707  	if (stack_base > STACK_SIZE_MAX)
708  		stack_base = STACK_SIZE_MAX;
709  
710  	/* Add space for stack randomization. */
711  	stack_base += (STACK_RND_MASK << PAGE_SHIFT);
712  
713  	/* Make sure we didn't let the argument array grow too large. */
714  	if (vma->vm_end - vma->vm_start > stack_base)
715  		return -ENOMEM;
716  
717  	stack_base = PAGE_ALIGN(stack_top - stack_base);
718  
719  	stack_shift = vma->vm_start - stack_base;
720  	mm->arg_start = bprm->p - stack_shift;
721  	bprm->p = vma->vm_end - stack_shift;
722  #else
723  	stack_top = arch_align_stack(stack_top);
724  	stack_top = PAGE_ALIGN(stack_top);
725  
726  	if (unlikely(stack_top < mmap_min_addr) ||
727  	    unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
728  		return -ENOMEM;
729  
730  	stack_shift = vma->vm_end - stack_top;
731  
732  	bprm->p -= stack_shift;
733  	mm->arg_start = bprm->p;
734  #endif
735  
736  	if (bprm->loader)
737  		bprm->loader -= stack_shift;
738  	bprm->exec -= stack_shift;
739  
740  	if (down_write_killable(&mm->mmap_sem))
741  		return -EINTR;
742  
743  	vm_flags = VM_STACK_FLAGS;
744  
745  	/*
746  	 * Adjust stack execute permissions; explicitly enable for
747  	 * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
748  	 * (arch default) otherwise.
749  	 */
750  	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
751  		vm_flags |= VM_EXEC;
752  	else if (executable_stack == EXSTACK_DISABLE_X)
753  		vm_flags &= ~VM_EXEC;
754  	vm_flags |= mm->def_flags;
755  	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
756  
757  	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
758  			vm_flags);
759  	if (ret)
760  		goto out_unlock;
761  	BUG_ON(prev != vma);
762  
763  	if (unlikely(vm_flags & VM_EXEC)) {
764  		pr_warn_once("process '%pD4' started with executable stack\n",
765  			     bprm->file);
766  	}
767  
768  	/* Move stack pages down in memory. */
769  	if (stack_shift) {
770  		ret = shift_arg_pages(vma, stack_shift);
771  		if (ret)
772  			goto out_unlock;
773  	}
774  
775  	/* mprotect_fixup is overkill to remove the temporary stack flags */
776  	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
777  
778  	stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
779  	stack_size = vma->vm_end - vma->vm_start;
780  	/*
781  	 * Align this down to a page boundary as expand_stack
782  	 * will align it up.
783  	 */
784  	rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
785  #ifdef CONFIG_STACK_GROWSUP
786  	if (stack_size + stack_expand > rlim_stack)
787  		stack_base = vma->vm_start + rlim_stack;
788  	else
789  		stack_base = vma->vm_end + stack_expand;
790  #else
791  	if (stack_size + stack_expand > rlim_stack)
792  		stack_base = vma->vm_end - rlim_stack;
793  	else
794  		stack_base = vma->vm_start - stack_expand;
795  #endif
796  	current->mm->start_stack = bprm->p;
797  	ret = expand_stack(vma, stack_base);
798  	if (ret)
799  		ret = -EFAULT;
800  
801  out_unlock:
802  	up_write(&mm->mmap_sem);
803  	return ret;
804  }
805  EXPORT_SYMBOL(setup_arg_pages);
806  
807  #else
808  
809  /*
810   * Transfer the program arguments and environment from the holding pages
811   * onto the stack. The provided stack pointer is adjusted accordingly.
812   */
813  int transfer_args_to_stack(struct linux_binprm *bprm,
814  			   unsigned long *sp_location)
815  {
816  	unsigned long index, stop, sp;
817  	int ret = 0;
818  
819  	stop = bprm->p >> PAGE_SHIFT;
820  	sp = *sp_location;
821  
822  	for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
823  		unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
824  		char *src = kmap(bprm->page[index]) + offset;
825  		sp -= PAGE_SIZE - offset;
826  		if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
827  			ret = -EFAULT;
828  		kunmap(bprm->page[index]);
829  		if (ret)
830  			goto out;
831  	}
832  
833  	*sp_location = sp;
834  
835  out:
836  	return ret;
837  }
838  EXPORT_SYMBOL(transfer_args_to_stack);
839  
840  #endif /* CONFIG_MMU */
841  
842  static struct file *do_open_execat(int fd, struct filename *name, int flags)
843  {
844  	struct file *file;
845  	int err;
846  	struct open_flags open_exec_flags = {
847  		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
848  		.acc_mode = MAY_EXEC,
849  		.intent = LOOKUP_OPEN,
850  		.lookup_flags = LOOKUP_FOLLOW,
851  	};
852  
853  	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
854  		return ERR_PTR(-EINVAL);
855  	if (flags & AT_SYMLINK_NOFOLLOW)
856  		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
857  	if (flags & AT_EMPTY_PATH)
858  		open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
859  
860  	file = do_filp_open(fd, name, &open_exec_flags);
861  	if (IS_ERR(file))
862  		goto out;
863  
864  	err = -EACCES;
865  	if (!S_ISREG(file_inode(file)->i_mode))
866  		goto exit;
867  
868  	if (path_noexec(&file->f_path))
869  		goto exit;
870  
871  	err = deny_write_access(file);
872  	if (err)
873  		goto exit;
874  
875  	if (name->name[0] != '\0')
876  		fsnotify_open(file);
877  
878  out:
879  	return file;
880  
881  exit:
882  	fput(file);
883  	return ERR_PTR(err);
884  }
885  
886  struct file *open_exec(const char *name)
887  {
888  	struct filename *filename = getname_kernel(name);
889  	struct file *f = ERR_CAST(filename);
890  
891  	if (!IS_ERR(filename)) {
892  		f = do_open_execat(AT_FDCWD, filename, 0);
893  		putname(filename);
894  	}
895  	return f;
896  }
897  EXPORT_SYMBOL(open_exec);
898  
899  int kernel_read_file(struct file *file, void **buf, loff_t *size,
900  		     loff_t max_size, enum kernel_read_file_id id)
901  {
902  	loff_t i_size, pos;
903  	ssize_t bytes = 0;
904  	int ret;
905  
906  	if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0)
907  		return -EINVAL;
908  
909  	ret = deny_write_access(file);
910  	if (ret)
911  		return ret;
912  
913  	ret = security_kernel_read_file(file, id);
914  	if (ret)
915  		goto out;
916  
917  	i_size = i_size_read(file_inode(file));
918  	if (i_size <= 0) {
919  		ret = -EINVAL;
920  		goto out;
921  	}
922  	if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) {
923  		ret = -EFBIG;
924  		goto out;
925  	}
926  
927  	if (id != READING_FIRMWARE_PREALLOC_BUFFER)
928  		*buf = vmalloc(i_size);
929  	if (!*buf) {
930  		ret = -ENOMEM;
931  		goto out;
932  	}
933  
934  	pos = 0;
935  	while (pos < i_size) {
936  		bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
937  		if (bytes < 0) {
938  			ret = bytes;
939  			goto out_free;
940  		}
941  
942  		if (bytes == 0)
943  			break;
944  	}
945  
946  	if (pos != i_size) {
947  		ret = -EIO;
948  		goto out_free;
949  	}
950  
951  	ret = security_kernel_post_read_file(file, *buf, i_size, id);
952  	if (!ret)
953  		*size = pos;
954  
955  out_free:
956  	if (ret < 0) {
957  		if (id != READING_FIRMWARE_PREALLOC_BUFFER) {
958  			vfree(*buf);
959  			*buf = NULL;
960  		}
961  	}
962  
963  out:
964  	allow_write_access(file);
965  	return ret;
966  }
967  EXPORT_SYMBOL_GPL(kernel_read_file);
968  
969  int kernel_read_file_from_path(const char *path, void **buf, loff_t *size,
970  			       loff_t max_size, enum kernel_read_file_id id)
971  {
972  	struct file *file;
973  	int ret;
974  
975  	if (!path || !*path)
976  		return -EINVAL;
977  
978  	file = filp_open(path, O_RDONLY, 0);
979  	if (IS_ERR(file))
980  		return PTR_ERR(file);
981  
982  	ret = kernel_read_file(file, buf, size, max_size, id);
983  	fput(file);
984  	return ret;
985  }
986  EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
987  
988  int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
989  			     enum kernel_read_file_id id)
990  {
991  	struct fd f = fdget(fd);
992  	int ret = -EBADF;
993  
994  	if (!f.file)
995  		goto out;
996  
997  	ret = kernel_read_file(f.file, buf, size, max_size, id);
998  out:
999  	fdput(f);
1000  	return ret;
1001  }
1002  EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
1003  
1004  ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
1005  {
1006  	ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
1007  	if (res > 0)
1008  		flush_icache_range(addr, addr + len);
1009  	return res;
1010  }
1011  EXPORT_SYMBOL(read_code);
1012  
1013  static int exec_mmap(struct mm_struct *mm)
1014  {
1015  	struct task_struct *tsk;
1016  	struct mm_struct *old_mm, *active_mm;
1017  
1018  	/* Notify parent that we're no longer interested in the old VM */
1019  	tsk = current;
1020  	old_mm = current->mm;
1021  	exec_mm_release(tsk, old_mm);
1022  
1023  	if (old_mm) {
1024  		sync_mm_rss(old_mm);
1025  		/*
1026  		 * Make sure that if there is a core dump in progress
1027  		 * for the old mm, we get out and die instead of going
1028  		 * through with the exec.  We must hold mmap_sem around
1029  		 * checking core_state and changing tsk->mm.
1030  		 */
1031  		down_read(&old_mm->mmap_sem);
1032  		if (unlikely(old_mm->core_state)) {
1033  			up_read(&old_mm->mmap_sem);
1034  			return -EINTR;
1035  		}
1036  	}
1037  	task_lock(tsk);
1038  	active_mm = tsk->active_mm;
1039  	membarrier_exec_mmap(mm);
1040  	tsk->mm = mm;
1041  	tsk->active_mm = mm;
1042  	activate_mm(active_mm, mm);
1043  	tsk->mm->vmacache_seqnum = 0;
1044  	vmacache_flush(tsk);
1045  	task_unlock(tsk);
1046  	if (old_mm) {
1047  		up_read(&old_mm->mmap_sem);
1048  		BUG_ON(active_mm != old_mm);
1049  		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
1050  		mm_update_next_owner(old_mm);
1051  		mmput(old_mm);
1052  		return 0;
1053  	}
1054  	mmdrop(active_mm);
1055  	return 0;
1056  }
1057  
1058  /*
1059   * This function makes sure the current process has its own signal table,
1060   * so that flush_signal_handlers can later reset the handlers without
1061   * disturbing other processes.  (Other processes might share the signal
1062   * table via the CLONE_SIGHAND option to clone().)
1063   */
1064  static int de_thread(struct task_struct *tsk)
1065  {
1066  	struct signal_struct *sig = tsk->signal;
1067  	struct sighand_struct *oldsighand = tsk->sighand;
1068  	spinlock_t *lock = &oldsighand->siglock;
1069  
1070  	if (thread_group_empty(tsk))
1071  		goto no_thread_group;
1072  
1073  	/*
1074  	 * Kill all other threads in the thread group.
1075  	 */
1076  	spin_lock_irq(lock);
1077  	if (signal_group_exit(sig)) {
1078  		/*
1079  		 * Another group action in progress, just
1080  		 * return so that the signal is processed.
1081  		 */
1082  		spin_unlock_irq(lock);
1083  		return -EAGAIN;
1084  	}
1085  
1086  	sig->group_exit_task = tsk;
1087  	sig->notify_count = zap_other_threads(tsk);
1088  	if (!thread_group_leader(tsk))
1089  		sig->notify_count--;
1090  
1091  	while (sig->notify_count) {
1092  		__set_current_state(TASK_KILLABLE);
1093  		spin_unlock_irq(lock);
1094  		schedule();
1095  		if (__fatal_signal_pending(tsk))
1096  			goto killed;
1097  		spin_lock_irq(lock);
1098  	}
1099  	spin_unlock_irq(lock);
1100  
1101  	/*
1102  	 * At this point all other threads have exited, all we have to
1103  	 * do is to wait for the thread group leader to become inactive,
1104  	 * and to assume its PID:
1105  	 */
1106  	if (!thread_group_leader(tsk)) {
1107  		struct task_struct *leader = tsk->group_leader;
1108  
1109  		for (;;) {
1110  			cgroup_threadgroup_change_begin(tsk);
1111  			write_lock_irq(&tasklist_lock);
1112  			/*
1113  			 * Do this under tasklist_lock to ensure that
1114  			 * exit_notify() can't miss ->group_exit_task
1115  			 */
1116  			sig->notify_count = -1;
1117  			if (likely(leader->exit_state))
1118  				break;
1119  			__set_current_state(TASK_KILLABLE);
1120  			write_unlock_irq(&tasklist_lock);
1121  			cgroup_threadgroup_change_end(tsk);
1122  			schedule();
1123  			if (__fatal_signal_pending(tsk))
1124  				goto killed;
1125  		}
1126  
1127  		/*
1128  		 * The only record we have of the real-time age of a
1129  		 * process, regardless of execs it's done, is start_time.
1130  		 * All the past CPU time is accumulated in signal_struct
1131  		 * from sister threads now dead.  But in this non-leader
1132  		 * exec, nothing survives from the original leader thread,
1133  		 * whose birth marks the true age of this process now.
1134  		 * When we take on its identity by switching to its PID, we
1135  		 * also take its birthdate (always earlier than our own).
1136  		 */
1137  		tsk->start_time = leader->start_time;
1138  		tsk->start_boottime = leader->start_boottime;
1139  
1140  		BUG_ON(!same_thread_group(leader, tsk));
1141  		BUG_ON(has_group_leader_pid(tsk));
1142  		/*
1143  		 * An exec() starts a new thread group with the
1144  		 * TGID of the previous thread group. Rehash the
1145  		 * two threads with a switched PID, and release
1146  		 * the former thread group leader:
1147  		 */
1148  
1149  		/* Become a process group leader with the old leader's pid.
1150  		 * The old leader becomes a thread of the this thread group.
1151  		 * Note: The old leader also uses this pid until release_task
1152  		 *       is called.  Odd but simple and correct.
1153  		 */
1154  		tsk->pid = leader->pid;
1155  		change_pid(tsk, PIDTYPE_PID, task_pid(leader));
1156  		transfer_pid(leader, tsk, PIDTYPE_TGID);
1157  		transfer_pid(leader, tsk, PIDTYPE_PGID);
1158  		transfer_pid(leader, tsk, PIDTYPE_SID);
1159  
1160  		list_replace_rcu(&leader->tasks, &tsk->tasks);
1161  		list_replace_init(&leader->sibling, &tsk->sibling);
1162  
1163  		tsk->group_leader = tsk;
1164  		leader->group_leader = tsk;
1165  
1166  		tsk->exit_signal = SIGCHLD;
1167  		leader->exit_signal = -1;
1168  
1169  		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
1170  		leader->exit_state = EXIT_DEAD;
1171  
1172  		/*
1173  		 * We are going to release_task()->ptrace_unlink() silently,
1174  		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
1175  		 * the tracer wont't block again waiting for this thread.
1176  		 */
1177  		if (unlikely(leader->ptrace))
1178  			__wake_up_parent(leader, leader->parent);
1179  		write_unlock_irq(&tasklist_lock);
1180  		cgroup_threadgroup_change_end(tsk);
1181  
1182  		release_task(leader);
1183  	}
1184  
1185  	sig->group_exit_task = NULL;
1186  	sig->notify_count = 0;
1187  
1188  no_thread_group:
1189  	/* we have changed execution domain */
1190  	tsk->exit_signal = SIGCHLD;
1191  
1192  #ifdef CONFIG_POSIX_TIMERS
1193  	exit_itimers(sig);
1194  	flush_itimer_signals();
1195  #endif
1196  
1197  	if (refcount_read(&oldsighand->count) != 1) {
1198  		struct sighand_struct *newsighand;
1199  		/*
1200  		 * This ->sighand is shared with the CLONE_SIGHAND
1201  		 * but not CLONE_THREAD task, switch to the new one.
1202  		 */
1203  		newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1204  		if (!newsighand)
1205  			return -ENOMEM;
1206  
1207  		refcount_set(&newsighand->count, 1);
1208  		memcpy(newsighand->action, oldsighand->action,
1209  		       sizeof(newsighand->action));
1210  
1211  		write_lock_irq(&tasklist_lock);
1212  		spin_lock(&oldsighand->siglock);
1213  		rcu_assign_pointer(tsk->sighand, newsighand);
1214  		spin_unlock(&oldsighand->siglock);
1215  		write_unlock_irq(&tasklist_lock);
1216  
1217  		__cleanup_sighand(oldsighand);
1218  	}
1219  
1220  	BUG_ON(!thread_group_leader(tsk));
1221  	return 0;
1222  
1223  killed:
1224  	/* protects against exit_notify() and __exit_signal() */
1225  	read_lock(&tasklist_lock);
1226  	sig->group_exit_task = NULL;
1227  	sig->notify_count = 0;
1228  	read_unlock(&tasklist_lock);
1229  	return -EAGAIN;
1230  }
1231  
1232  char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
1233  {
1234  	task_lock(tsk);
1235  	strncpy(buf, tsk->comm, buf_size);
1236  	task_unlock(tsk);
1237  	return buf;
1238  }
1239  EXPORT_SYMBOL_GPL(__get_task_comm);
1240  
1241  /*
1242   * These functions flushes out all traces of the currently running executable
1243   * so that a new one can be started
1244   */
1245  
1246  void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
1247  {
1248  	task_lock(tsk);
1249  	trace_task_rename(tsk, buf);
1250  	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
1251  	task_unlock(tsk);
1252  	perf_event_comm(tsk, exec);
1253  }
1254  
1255  /*
1256   * Calling this is the point of no return. None of the failures will be
1257   * seen by userspace since either the process is already taking a fatal
1258   * signal (via de_thread() or coredump), or will have SEGV raised
1259   * (after exec_mmap()) by search_binary_handlers (see below).
1260   */
1261  int flush_old_exec(struct linux_binprm * bprm)
1262  {
1263  	int retval;
1264  
1265  	/*
1266  	 * Make sure we have a private signal table and that
1267  	 * we are unassociated from the previous thread group.
1268  	 */
1269  	retval = de_thread(current);
1270  	if (retval)
1271  		goto out;
1272  
1273  	/*
1274  	 * Must be called _before_ exec_mmap() as bprm->mm is
1275  	 * not visibile until then. This also enables the update
1276  	 * to be lockless.
1277  	 */
1278  	set_mm_exe_file(bprm->mm, bprm->file);
1279  
1280  	/*
1281  	 * Release all of the old mmap stuff
1282  	 */
1283  	acct_arg_size(bprm, 0);
1284  	retval = exec_mmap(bprm->mm);
1285  	if (retval)
1286  		goto out;
1287  
1288  	/*
1289  	 * After clearing bprm->mm (to mark that current is using the
1290  	 * prepared mm now), we have nothing left of the original
1291  	 * process. If anything from here on returns an error, the check
1292  	 * in search_binary_handler() will SEGV current.
1293  	 */
1294  	bprm->mm = NULL;
1295  
1296  	set_fs(USER_DS);
1297  	current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
1298  					PF_NOFREEZE | PF_NO_SETAFFINITY);
1299  	flush_thread();
1300  	current->personality &= ~bprm->per_clear;
1301  
1302  	/*
1303  	 * We have to apply CLOEXEC before we change whether the process is
1304  	 * dumpable (in setup_new_exec) to avoid a race with a process in userspace
1305  	 * trying to access the should-be-closed file descriptors of a process
1306  	 * undergoing exec(2).
1307  	 */
1308  	do_close_on_exec(current->files);
1309  	return 0;
1310  
1311  out:
1312  	return retval;
1313  }
1314  EXPORT_SYMBOL(flush_old_exec);
1315  
1316  void would_dump(struct linux_binprm *bprm, struct file *file)
1317  {
1318  	struct inode *inode = file_inode(file);
1319  	if (inode_permission(inode, MAY_READ) < 0) {
1320  		struct user_namespace *old, *user_ns;
1321  		bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
1322  
1323  		/* Ensure mm->user_ns contains the executable */
1324  		user_ns = old = bprm->mm->user_ns;
1325  		while ((user_ns != &init_user_ns) &&
1326  		       !privileged_wrt_inode_uidgid(user_ns, inode))
1327  			user_ns = user_ns->parent;
1328  
1329  		if (old != user_ns) {
1330  			bprm->mm->user_ns = get_user_ns(user_ns);
1331  			put_user_ns(old);
1332  		}
1333  	}
1334  }
1335  EXPORT_SYMBOL(would_dump);
1336  
1337  void setup_new_exec(struct linux_binprm * bprm)
1338  {
1339  	/*
1340  	 * Once here, prepare_binrpm() will not be called any more, so
1341  	 * the final state of setuid/setgid/fscaps can be merged into the
1342  	 * secureexec flag.
1343  	 */
1344  	bprm->secureexec |= bprm->cap_elevated;
1345  
1346  	if (bprm->secureexec) {
1347  		/* Make sure parent cannot signal privileged process. */
1348  		current->pdeath_signal = 0;
1349  
1350  		/*
1351  		 * For secureexec, reset the stack limit to sane default to
1352  		 * avoid bad behavior from the prior rlimits. This has to
1353  		 * happen before arch_pick_mmap_layout(), which examines
1354  		 * RLIMIT_STACK, but after the point of no return to avoid
1355  		 * needing to clean up the change on failure.
1356  		 */
1357  		if (bprm->rlim_stack.rlim_cur > _STK_LIM)
1358  			bprm->rlim_stack.rlim_cur = _STK_LIM;
1359  	}
1360  
1361  	arch_pick_mmap_layout(current->mm, &bprm->rlim_stack);
1362  
1363  	current->sas_ss_sp = current->sas_ss_size = 0;
1364  
1365  	/*
1366  	 * Figure out dumpability. Note that this checking only of current
1367  	 * is wrong, but userspace depends on it. This should be testing
1368  	 * bprm->secureexec instead.
1369  	 */
1370  	if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
1371  	    !(uid_eq(current_euid(), current_uid()) &&
1372  	      gid_eq(current_egid(), current_gid())))
1373  		set_dumpable(current->mm, suid_dumpable);
1374  	else
1375  		set_dumpable(current->mm, SUID_DUMP_USER);
1376  
1377  	arch_setup_new_exec();
1378  	perf_event_exec();
1379  	__set_task_comm(current, kbasename(bprm->filename), true);
1380  
1381  	/* Set the new mm task size. We have to do that late because it may
1382  	 * depend on TIF_32BIT which is only updated in flush_thread() on
1383  	 * some architectures like powerpc
1384  	 */
1385  	current->mm->task_size = TASK_SIZE;
1386  
1387  	/* An exec changes our domain. We are no longer part of the thread
1388  	   group */
1389  	current->self_exec_id++;
1390  	flush_signal_handlers(current, 0);
1391  }
1392  EXPORT_SYMBOL(setup_new_exec);
1393  
1394  /* Runs immediately before start_thread() takes over. */
1395  void finalize_exec(struct linux_binprm *bprm)
1396  {
1397  	/* Store any stack rlimit changes before starting thread. */
1398  	task_lock(current->group_leader);
1399  	current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
1400  	task_unlock(current->group_leader);
1401  }
1402  EXPORT_SYMBOL(finalize_exec);
1403  
1404  /*
1405   * Prepare credentials and lock ->cred_guard_mutex.
1406   * install_exec_creds() commits the new creds and drops the lock.
1407   * Or, if exec fails before, free_bprm() should release ->cred and
1408   * and unlock.
1409   */
1410  static int prepare_bprm_creds(struct linux_binprm *bprm)
1411  {
1412  	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
1413  		return -ERESTARTNOINTR;
1414  
1415  	bprm->cred = prepare_exec_creds();
1416  	if (likely(bprm->cred))
1417  		return 0;
1418  
1419  	mutex_unlock(&current->signal->cred_guard_mutex);
1420  	return -ENOMEM;
1421  }
1422  
1423  static void free_bprm(struct linux_binprm *bprm)
1424  {
1425  	free_arg_pages(bprm);
1426  	if (bprm->cred) {
1427  		mutex_unlock(&current->signal->cred_guard_mutex);
1428  		abort_creds(bprm->cred);
1429  	}
1430  	if (bprm->file) {
1431  		allow_write_access(bprm->file);
1432  		fput(bprm->file);
1433  	}
1434  	/* If a binfmt changed the interp, free it. */
1435  	if (bprm->interp != bprm->filename)
1436  		kfree(bprm->interp);
1437  	kfree(bprm);
1438  }
1439  
1440  int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
1441  {
1442  	/* If a binfmt changed the interp, free it first. */
1443  	if (bprm->interp != bprm->filename)
1444  		kfree(bprm->interp);
1445  	bprm->interp = kstrdup(interp, GFP_KERNEL);
1446  	if (!bprm->interp)
1447  		return -ENOMEM;
1448  	return 0;
1449  }
1450  EXPORT_SYMBOL(bprm_change_interp);
1451  
1452  /*
1453   * install the new credentials for this executable
1454   */
1455  void install_exec_creds(struct linux_binprm *bprm)
1456  {
1457  	security_bprm_committing_creds(bprm);
1458  
1459  	commit_creds(bprm->cred);
1460  	bprm->cred = NULL;
1461  
1462  	/*
1463  	 * Disable monitoring for regular users
1464  	 * when executing setuid binaries. Must
1465  	 * wait until new credentials are committed
1466  	 * by commit_creds() above
1467  	 */
1468  	if (get_dumpable(current->mm) != SUID_DUMP_USER)
1469  		perf_event_exit_task(current);
1470  	/*
1471  	 * cred_guard_mutex must be held at least to this point to prevent
1472  	 * ptrace_attach() from altering our determination of the task's
1473  	 * credentials; any time after this it may be unlocked.
1474  	 */
1475  	security_bprm_committed_creds(bprm);
1476  	mutex_unlock(&current->signal->cred_guard_mutex);
1477  }
1478  EXPORT_SYMBOL(install_exec_creds);
1479  
1480  /*
1481   * determine how safe it is to execute the proposed program
1482   * - the caller must hold ->cred_guard_mutex to protect against
1483   *   PTRACE_ATTACH or seccomp thread-sync
1484   */
1485  static void check_unsafe_exec(struct linux_binprm *bprm)
1486  {
1487  	struct task_struct *p = current, *t;
1488  	unsigned n_fs;
1489  
1490  	if (p->ptrace)
1491  		bprm->unsafe |= LSM_UNSAFE_PTRACE;
1492  
1493  	/*
1494  	 * This isn't strictly necessary, but it makes it harder for LSMs to
1495  	 * mess up.
1496  	 */
1497  	if (task_no_new_privs(current))
1498  		bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
1499  
1500  	t = p;
1501  	n_fs = 1;
1502  	spin_lock(&p->fs->lock);
1503  	rcu_read_lock();
1504  	while_each_thread(p, t) {
1505  		if (t->fs == p->fs)
1506  			n_fs++;
1507  	}
1508  	rcu_read_unlock();
1509  
1510  	if (p->fs->users > n_fs)
1511  		bprm->unsafe |= LSM_UNSAFE_SHARE;
1512  	else
1513  		p->fs->in_exec = 1;
1514  	spin_unlock(&p->fs->lock);
1515  }
1516  
1517  static void bprm_fill_uid(struct linux_binprm *bprm)
1518  {
1519  	struct inode *inode;
1520  	unsigned int mode;
1521  	kuid_t uid;
1522  	kgid_t gid;
1523  
1524  	/*
1525  	 * Since this can be called multiple times (via prepare_binprm),
1526  	 * we must clear any previous work done when setting set[ug]id
1527  	 * bits from any earlier bprm->file uses (for example when run
1528  	 * first for a setuid script then again for its interpreter).
1529  	 */
1530  	bprm->cred->euid = current_euid();
1531  	bprm->cred->egid = current_egid();
1532  
1533  	if (!mnt_may_suid(bprm->file->f_path.mnt))
1534  		return;
1535  
1536  	if (task_no_new_privs(current))
1537  		return;
1538  
1539  	inode = bprm->file->f_path.dentry->d_inode;
1540  	mode = READ_ONCE(inode->i_mode);
1541  	if (!(mode & (S_ISUID|S_ISGID)))
1542  		return;
1543  
1544  	/* Be careful if suid/sgid is set */
1545  	inode_lock(inode);
1546  
1547  	/* reload atomically mode/uid/gid now that lock held */
1548  	mode = inode->i_mode;
1549  	uid = inode->i_uid;
1550  	gid = inode->i_gid;
1551  	inode_unlock(inode);
1552  
1553  	/* We ignore suid/sgid if there are no mappings for them in the ns */
1554  	if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
1555  		 !kgid_has_mapping(bprm->cred->user_ns, gid))
1556  		return;
1557  
1558  	if (mode & S_ISUID) {
1559  		bprm->per_clear |= PER_CLEAR_ON_SETID;
1560  		bprm->cred->euid = uid;
1561  	}
1562  
1563  	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1564  		bprm->per_clear |= PER_CLEAR_ON_SETID;
1565  		bprm->cred->egid = gid;
1566  	}
1567  }
1568  
1569  /*
1570   * Fill the binprm structure from the inode.
1571   * Check permissions, then read the first BINPRM_BUF_SIZE bytes
1572   *
1573   * This may be called multiple times for binary chains (scripts for example).
1574   */
1575  int prepare_binprm(struct linux_binprm *bprm)
1576  {
1577  	int retval;
1578  	loff_t pos = 0;
1579  
1580  	bprm_fill_uid(bprm);
1581  
1582  	/* fill in binprm security blob */
1583  	retval = security_bprm_set_creds(bprm);
1584  	if (retval)
1585  		return retval;
1586  	bprm->called_set_creds = 1;
1587  
1588  	memset(bprm->buf, 0, BINPRM_BUF_SIZE);
1589  	return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
1590  }
1591  
1592  EXPORT_SYMBOL(prepare_binprm);
1593  
1594  /*
1595   * Arguments are '\0' separated strings found at the location bprm->p
1596   * points to; chop off the first by relocating brpm->p to right after
1597   * the first '\0' encountered.
1598   */
1599  int remove_arg_zero(struct linux_binprm *bprm)
1600  {
1601  	int ret = 0;
1602  	unsigned long offset;
1603  	char *kaddr;
1604  	struct page *page;
1605  
1606  	if (!bprm->argc)
1607  		return 0;
1608  
1609  	do {
1610  		offset = bprm->p & ~PAGE_MASK;
1611  		page = get_arg_page(bprm, bprm->p, 0);
1612  		if (!page) {
1613  			ret = -EFAULT;
1614  			goto out;
1615  		}
1616  		kaddr = kmap_atomic(page);
1617  
1618  		for (; offset < PAGE_SIZE && kaddr[offset];
1619  				offset++, bprm->p++)
1620  			;
1621  
1622  		kunmap_atomic(kaddr);
1623  		put_arg_page(page);
1624  	} while (offset == PAGE_SIZE);
1625  
1626  	bprm->p++;
1627  	bprm->argc--;
1628  	ret = 0;
1629  
1630  out:
1631  	return ret;
1632  }
1633  EXPORT_SYMBOL(remove_arg_zero);
1634  
1635  #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1636  /*
1637   * cycle the list of binary formats handler, until one recognizes the image
1638   */
1639  int search_binary_handler(struct linux_binprm *bprm)
1640  {
1641  	bool need_retry = IS_ENABLED(CONFIG_MODULES);
1642  	struct linux_binfmt *fmt;
1643  	int retval;
1644  
1645  	/* This allows 4 levels of binfmt rewrites before failing hard. */
1646  	if (bprm->recursion_depth > 5)
1647  		return -ELOOP;
1648  
1649  	retval = security_bprm_check(bprm);
1650  	if (retval)
1651  		return retval;
1652  
1653  	retval = -ENOENT;
1654   retry:
1655  	read_lock(&binfmt_lock);
1656  	list_for_each_entry(fmt, &formats, lh) {
1657  		if (!try_module_get(fmt->module))
1658  			continue;
1659  		read_unlock(&binfmt_lock);
1660  
1661  		bprm->recursion_depth++;
1662  		retval = fmt->load_binary(bprm);
1663  		bprm->recursion_depth--;
1664  
1665  		read_lock(&binfmt_lock);
1666  		put_binfmt(fmt);
1667  		if (retval < 0 && !bprm->mm) {
1668  			/* we got to flush_old_exec() and failed after it */
1669  			read_unlock(&binfmt_lock);
1670  			force_sigsegv(SIGSEGV);
1671  			return retval;
1672  		}
1673  		if (retval != -ENOEXEC || !bprm->file) {
1674  			read_unlock(&binfmt_lock);
1675  			return retval;
1676  		}
1677  	}
1678  	read_unlock(&binfmt_lock);
1679  
1680  	if (need_retry) {
1681  		if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
1682  		    printable(bprm->buf[2]) && printable(bprm->buf[3]))
1683  			return retval;
1684  		if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
1685  			return retval;
1686  		need_retry = false;
1687  		goto retry;
1688  	}
1689  
1690  	return retval;
1691  }
1692  EXPORT_SYMBOL(search_binary_handler);
1693  
1694  static int exec_binprm(struct linux_binprm *bprm)
1695  {
1696  	pid_t old_pid, old_vpid;
1697  	int ret;
1698  
1699  	/* Need to fetch pid before load_binary changes it */
1700  	old_pid = current->pid;
1701  	rcu_read_lock();
1702  	old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
1703  	rcu_read_unlock();
1704  
1705  	ret = search_binary_handler(bprm);
1706  	if (ret >= 0) {
1707  		audit_bprm(bprm);
1708  		trace_sched_process_exec(current, old_pid, bprm);
1709  		ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1710  		proc_exec_connector(current);
1711  	}
1712  
1713  	return ret;
1714  }
1715  
1716  /*
1717   * sys_execve() executes a new program.
1718   */
1719  static int __do_execve_file(int fd, struct filename *filename,
1720  			    struct user_arg_ptr argv,
1721  			    struct user_arg_ptr envp,
1722  			    int flags, struct file *file)
1723  {
1724  	char *pathbuf = NULL;
1725  	struct linux_binprm *bprm;
1726  	struct files_struct *displaced;
1727  	int retval;
1728  
1729  	if (IS_ERR(filename))
1730  		return PTR_ERR(filename);
1731  
1732  	/*
1733  	 * We move the actual failure in case of RLIMIT_NPROC excess from
1734  	 * set*uid() to execve() because too many poorly written programs
1735  	 * don't check setuid() return code.  Here we additionally recheck
1736  	 * whether NPROC limit is still exceeded.
1737  	 */
1738  	if ((current->flags & PF_NPROC_EXCEEDED) &&
1739  	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
1740  		retval = -EAGAIN;
1741  		goto out_ret;
1742  	}
1743  
1744  	/* We're below the limit (still or again), so we don't want to make
1745  	 * further execve() calls fail. */
1746  	current->flags &= ~PF_NPROC_EXCEEDED;
1747  
1748  	retval = unshare_files(&displaced);
1749  	if (retval)
1750  		goto out_ret;
1751  
1752  	retval = -ENOMEM;
1753  	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1754  	if (!bprm)
1755  		goto out_files;
1756  
1757  	retval = prepare_bprm_creds(bprm);
1758  	if (retval)
1759  		goto out_free;
1760  
1761  	check_unsafe_exec(bprm);
1762  	current->in_execve = 1;
1763  
1764  	if (!file)
1765  		file = do_open_execat(fd, filename, flags);
1766  	retval = PTR_ERR(file);
1767  	if (IS_ERR(file))
1768  		goto out_unmark;
1769  
1770  	sched_exec();
1771  
1772  	bprm->file = file;
1773  	if (!filename) {
1774  		bprm->filename = "none";
1775  	} else if (fd == AT_FDCWD || filename->name[0] == '/') {
1776  		bprm->filename = filename->name;
1777  	} else {
1778  		if (filename->name[0] == '\0')
1779  			pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
1780  		else
1781  			pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
1782  					    fd, filename->name);
1783  		if (!pathbuf) {
1784  			retval = -ENOMEM;
1785  			goto out_unmark;
1786  		}
1787  		/*
1788  		 * Record that a name derived from an O_CLOEXEC fd will be
1789  		 * inaccessible after exec. Relies on having exclusive access to
1790  		 * current->files (due to unshare_files above).
1791  		 */
1792  		if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
1793  			bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
1794  		bprm->filename = pathbuf;
1795  	}
1796  	bprm->interp = bprm->filename;
1797  
1798  	retval = bprm_mm_init(bprm);
1799  	if (retval)
1800  		goto out_unmark;
1801  
1802  	retval = prepare_arg_pages(bprm, argv, envp);
1803  	if (retval < 0)
1804  		goto out;
1805  
1806  	retval = prepare_binprm(bprm);
1807  	if (retval < 0)
1808  		goto out;
1809  
1810  	retval = copy_strings_kernel(1, &bprm->filename, bprm);
1811  	if (retval < 0)
1812  		goto out;
1813  
1814  	bprm->exec = bprm->p;
1815  	retval = copy_strings(bprm->envc, envp, bprm);
1816  	if (retval < 0)
1817  		goto out;
1818  
1819  	retval = copy_strings(bprm->argc, argv, bprm);
1820  	if (retval < 0)
1821  		goto out;
1822  
1823  	would_dump(bprm, bprm->file);
1824  
1825  	retval = exec_binprm(bprm);
1826  	if (retval < 0)
1827  		goto out;
1828  
1829  	/* execve succeeded */
1830  	current->fs->in_exec = 0;
1831  	current->in_execve = 0;
1832  	rseq_execve(current);
1833  	acct_update_integrals(current);
1834  	task_numa_free(current, false);
1835  	free_bprm(bprm);
1836  	kfree(pathbuf);
1837  	if (filename)
1838  		putname(filename);
1839  	if (displaced)
1840  		put_files_struct(displaced);
1841  	return retval;
1842  
1843  out:
1844  	if (bprm->mm) {
1845  		acct_arg_size(bprm, 0);
1846  		mmput(bprm->mm);
1847  	}
1848  
1849  out_unmark:
1850  	current->fs->in_exec = 0;
1851  	current->in_execve = 0;
1852  
1853  out_free:
1854  	free_bprm(bprm);
1855  	kfree(pathbuf);
1856  
1857  out_files:
1858  	if (displaced)
1859  		reset_files_struct(displaced);
1860  out_ret:
1861  	if (filename)
1862  		putname(filename);
1863  	return retval;
1864  }
1865  
1866  static int do_execveat_common(int fd, struct filename *filename,
1867  			      struct user_arg_ptr argv,
1868  			      struct user_arg_ptr envp,
1869  			      int flags)
1870  {
1871  	return __do_execve_file(fd, filename, argv, envp, flags, NULL);
1872  }
1873  
1874  int do_execve_file(struct file *file, void *__argv, void *__envp)
1875  {
1876  	struct user_arg_ptr argv = { .ptr.native = __argv };
1877  	struct user_arg_ptr envp = { .ptr.native = __envp };
1878  
1879  	return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file);
1880  }
1881  
1882  int do_execve(struct filename *filename,
1883  	const char __user *const __user *__argv,
1884  	const char __user *const __user *__envp)
1885  {
1886  	struct user_arg_ptr argv = { .ptr.native = __argv };
1887  	struct user_arg_ptr envp = { .ptr.native = __envp };
1888  	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
1889  }
1890  
1891  int do_execveat(int fd, struct filename *filename,
1892  		const char __user *const __user *__argv,
1893  		const char __user *const __user *__envp,
1894  		int flags)
1895  {
1896  	struct user_arg_ptr argv = { .ptr.native = __argv };
1897  	struct user_arg_ptr envp = { .ptr.native = __envp };
1898  
1899  	return do_execveat_common(fd, filename, argv, envp, flags);
1900  }
1901  
1902  #ifdef CONFIG_COMPAT
1903  static int compat_do_execve(struct filename *filename,
1904  	const compat_uptr_t __user *__argv,
1905  	const compat_uptr_t __user *__envp)
1906  {
1907  	struct user_arg_ptr argv = {
1908  		.is_compat = true,
1909  		.ptr.compat = __argv,
1910  	};
1911  	struct user_arg_ptr envp = {
1912  		.is_compat = true,
1913  		.ptr.compat = __envp,
1914  	};
1915  	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
1916  }
1917  
1918  static int compat_do_execveat(int fd, struct filename *filename,
1919  			      const compat_uptr_t __user *__argv,
1920  			      const compat_uptr_t __user *__envp,
1921  			      int flags)
1922  {
1923  	struct user_arg_ptr argv = {
1924  		.is_compat = true,
1925  		.ptr.compat = __argv,
1926  	};
1927  	struct user_arg_ptr envp = {
1928  		.is_compat = true,
1929  		.ptr.compat = __envp,
1930  	};
1931  	return do_execveat_common(fd, filename, argv, envp, flags);
1932  }
1933  #endif
1934  
1935  void set_binfmt(struct linux_binfmt *new)
1936  {
1937  	struct mm_struct *mm = current->mm;
1938  
1939  	if (mm->binfmt)
1940  		module_put(mm->binfmt->module);
1941  
1942  	mm->binfmt = new;
1943  	if (new)
1944  		__module_get(new->module);
1945  }
1946  EXPORT_SYMBOL(set_binfmt);
1947  
1948  /*
1949   * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
1950   */
1951  void set_dumpable(struct mm_struct *mm, int value)
1952  {
1953  	if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
1954  		return;
1955  
1956  	set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
1957  }
1958  
1959  SYSCALL_DEFINE3(execve,
1960  		const char __user *, filename,
1961  		const char __user *const __user *, argv,
1962  		const char __user *const __user *, envp)
1963  {
1964  	return do_execve(getname(filename), argv, envp);
1965  }
1966  
1967  SYSCALL_DEFINE5(execveat,
1968  		int, fd, const char __user *, filename,
1969  		const char __user *const __user *, argv,
1970  		const char __user *const __user *, envp,
1971  		int, flags)
1972  {
1973  	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
1974  
1975  	return do_execveat(fd,
1976  			   getname_flags(filename, lookup_flags, NULL),
1977  			   argv, envp, flags);
1978  }
1979  
1980  #ifdef CONFIG_COMPAT
1981  COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
1982  	const compat_uptr_t __user *, argv,
1983  	const compat_uptr_t __user *, envp)
1984  {
1985  	return compat_do_execve(getname(filename), argv, envp);
1986  }
1987  
1988  COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
1989  		       const char __user *, filename,
1990  		       const compat_uptr_t __user *, argv,
1991  		       const compat_uptr_t __user *, envp,
1992  		       int,  flags)
1993  {
1994  	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
1995  
1996  	return compat_do_execveat(fd,
1997  				  getname_flags(filename, lookup_flags, NULL),
1998  				  argv, envp, flags);
1999  }
2000  #endif
2001