xref: /openbmc/linux/fs/exec.c (revision ec8f24b7faaf3d4799a7c3f4c1b87f6b02778ad1)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *  linux/fs/exec.c
4   *
5   *  Copyright (C) 1991, 1992  Linus Torvalds
6   */
7  
8  /*
9   * #!-checking implemented by tytso.
10   */
11  /*
12   * Demand-loading implemented 01.12.91 - no need to read anything but
13   * the header into memory. The inode of the executable is put into
14   * "current->executable", and page faults do the actual loading. Clean.
15   *
16   * Once more I can proudly say that linux stood up to being changed: it
17   * was less than 2 hours' work to get demand-loading completely implemented.
18   *
19   * Demand loading changed July 1993 by Eric Youngdale.  Use mmap instead;
20   * current->executable is only used by procfs.  This allows a dispatch
21   * table to check for several different types of binary formats.  We keep
22   * trying until we recognize the file or we run out of supported binary
23   * formats.
24   */
25  
26  #include <linux/slab.h>
27  #include <linux/file.h>
28  #include <linux/fdtable.h>
29  #include <linux/mm.h>
30  #include <linux/vmacache.h>
31  #include <linux/stat.h>
32  #include <linux/fcntl.h>
33  #include <linux/swap.h>
34  #include <linux/string.h>
35  #include <linux/init.h>
36  #include <linux/sched/mm.h>
37  #include <linux/sched/coredump.h>
38  #include <linux/sched/signal.h>
39  #include <linux/sched/numa_balancing.h>
40  #include <linux/sched/task.h>
41  #include <linux/pagemap.h>
42  #include <linux/perf_event.h>
43  #include <linux/highmem.h>
44  #include <linux/spinlock.h>
45  #include <linux/key.h>
46  #include <linux/personality.h>
47  #include <linux/binfmts.h>
48  #include <linux/utsname.h>
49  #include <linux/pid_namespace.h>
50  #include <linux/module.h>
51  #include <linux/namei.h>
52  #include <linux/mount.h>
53  #include <linux/security.h>
54  #include <linux/syscalls.h>
55  #include <linux/tsacct_kern.h>
56  #include <linux/cn_proc.h>
57  #include <linux/audit.h>
58  #include <linux/tracehook.h>
59  #include <linux/kmod.h>
60  #include <linux/fsnotify.h>
61  #include <linux/fs_struct.h>
62  #include <linux/pipe_fs_i.h>
63  #include <linux/oom.h>
64  #include <linux/compat.h>
65  #include <linux/vmalloc.h>
66  
67  #include <linux/uaccess.h>
68  #include <asm/mmu_context.h>
69  #include <asm/tlb.h>
70  
71  #include <trace/events/task.h>
72  #include "internal.h"
73  
74  #include <trace/events/sched.h>
75  
76  int suid_dumpable = 0;
77  
78  static LIST_HEAD(formats);
79  static DEFINE_RWLOCK(binfmt_lock);
80  
81  void __register_binfmt(struct linux_binfmt * fmt, int insert)
82  {
83  	BUG_ON(!fmt);
84  	if (WARN_ON(!fmt->load_binary))
85  		return;
86  	write_lock(&binfmt_lock);
87  	insert ? list_add(&fmt->lh, &formats) :
88  		 list_add_tail(&fmt->lh, &formats);
89  	write_unlock(&binfmt_lock);
90  }
91  
92  EXPORT_SYMBOL(__register_binfmt);
93  
94  void unregister_binfmt(struct linux_binfmt * fmt)
95  {
96  	write_lock(&binfmt_lock);
97  	list_del(&fmt->lh);
98  	write_unlock(&binfmt_lock);
99  }
100  
101  EXPORT_SYMBOL(unregister_binfmt);
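
/*
 * Editorial sketch (not part of this file): how a module would use the
 * registration API above via the register_binfmt()/unregister_binfmt()
 * wrappers from <linux/binfmts.h>.  The "example_*" names are
 * hypothetical; a real handler would parse bprm->buf in load_binary:
 *
 *	static int example_load_binary(struct linux_binprm *bprm)
 *	{
 *		return -ENOEXEC;	// stub: recognize nothing
 *	}
 *
 *	static struct linux_binfmt example_format = {
 *		.module      = THIS_MODULE,
 *		.load_binary = example_load_binary,
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		register_binfmt(&example_format);	// tail of "formats"
 *		return 0;
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		unregister_binfmt(&example_format);
 *	}
 */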
102  
103  static inline void put_binfmt(struct linux_binfmt * fmt)
104  {
105  	module_put(fmt->module);
106  }
107  
108  bool path_noexec(const struct path *path)
109  {
110  	return (path->mnt->mnt_flags & MNT_NOEXEC) ||
111  	       (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
112  }
113  
114  #ifdef CONFIG_USELIB
115  /*
116   * Note that a shared library must be both readable and executable for
117   * security reasons.
118   *
119   * Also note that we take the load address from the file itself.
120   */
121  SYSCALL_DEFINE1(uselib, const char __user *, library)
122  {
123  	struct linux_binfmt *fmt;
124  	struct file *file;
125  	struct filename *tmp = getname(library);
126  	int error = PTR_ERR(tmp);
127  	static const struct open_flags uselib_flags = {
128  		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
129  		.acc_mode = MAY_READ | MAY_EXEC,
130  		.intent = LOOKUP_OPEN,
131  		.lookup_flags = LOOKUP_FOLLOW,
132  	};
133  
134  	if (IS_ERR(tmp))
135  		goto out;
136  
137  	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
138  	putname(tmp);
139  	error = PTR_ERR(file);
140  	if (IS_ERR(file))
141  		goto out;
142  
143  	error = -EINVAL;
144  	if (!S_ISREG(file_inode(file)->i_mode))
145  		goto exit;
146  
147  	error = -EACCES;
148  	if (path_noexec(&file->f_path))
149  		goto exit;
150  
151  	fsnotify_open(file);
152  
153  	error = -ENOEXEC;
154  
155  	read_lock(&binfmt_lock);
156  	list_for_each_entry(fmt, &formats, lh) {
157  		if (!fmt->load_shlib)
158  			continue;
159  		if (!try_module_get(fmt->module))
160  			continue;
161  		read_unlock(&binfmt_lock);
162  		error = fmt->load_shlib(file);
163  		read_lock(&binfmt_lock);
164  		put_binfmt(fmt);
165  		if (error != -ENOEXEC)
166  			break;
167  	}
168  	read_unlock(&binfmt_lock);
169  exit:
170  	fput(file);
171  out:
172  	return error;
173  }
174  #endif /* #ifdef CONFIG_USELIB */
175  
176  #ifdef CONFIG_MMU
177  /*
178   * The nascent bprm->mm is not visible until exec_mmap(), but it can
179   * use a lot of memory, so account these pages in current->mm temporarily
180   * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
181   * change the counter back via acct_arg_size(0).
182   */
183  static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
184  {
185  	struct mm_struct *mm = current->mm;
186  	long diff = (long)(pages - bprm->vma_pages);
187  
188  	if (!mm || !diff)
189  		return;
190  
191  	bprm->vma_pages = pages;
192  	add_mm_counter(mm, MM_ANONPAGES, diff);
193  }
194  
195  static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
196  		int write)
197  {
198  	struct page *page;
199  	int ret;
200  	unsigned int gup_flags = FOLL_FORCE;
201  
202  #ifdef CONFIG_STACK_GROWSUP
203  	if (write) {
204  		ret = expand_downwards(bprm->vma, pos);
205  		if (ret < 0)
206  			return NULL;
207  	}
208  #endif
209  
210  	if (write)
211  		gup_flags |= FOLL_WRITE;
212  
213  	/*
214  	 * We are doing an exec().  'current' is the process
215  	 * doing the exec and bprm->mm is the new process's mm.
216  	 */
217  	ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
218  			&page, NULL, NULL);
219  	if (ret <= 0)
220  		return NULL;
221  
222  	if (write)
223  		acct_arg_size(bprm, vma_pages(bprm->vma));
224  
225  	return page;
226  }
227  
228  static void put_arg_page(struct page *page)
229  {
230  	put_page(page);
231  }
232  
233  static void free_arg_pages(struct linux_binprm *bprm)
234  {
235  }
236  
237  static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
238  		struct page *page)
239  {
240  	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
241  }
242  
243  static int __bprm_mm_init(struct linux_binprm *bprm)
244  {
245  	int err;
246  	struct vm_area_struct *vma = NULL;
247  	struct mm_struct *mm = bprm->mm;
248  
249  	bprm->vma = vma = vm_area_alloc(mm);
250  	if (!vma)
251  		return -ENOMEM;
252  	vma_set_anonymous(vma);
253  
254  	if (down_write_killable(&mm->mmap_sem)) {
255  		err = -EINTR;
256  		goto err_free;
257  	}
258  
259  	/*
260  	 * Place the stack at the largest stack address the architecture
261  	 * supports. Later, we'll move this to an appropriate place. We don't
262  	 * use STACK_TOP because that can depend on attributes which aren't
263  	 * configured yet.
264  	 */
265  	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
266  	vma->vm_end = STACK_TOP_MAX;
267  	vma->vm_start = vma->vm_end - PAGE_SIZE;
268  	vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
269  	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
270  
271  	err = insert_vm_struct(mm, vma);
272  	if (err)
273  		goto err;
274  
275  	mm->stack_vm = mm->total_vm = 1;
276  	arch_bprm_mm_init(mm, vma);
277  	up_write(&mm->mmap_sem);
278  	bprm->p = vma->vm_end - sizeof(void *);
279  	return 0;
280  err:
281  	up_write(&mm->mmap_sem);
282  err_free:
283  	bprm->vma = NULL;
284  	vm_area_free(vma);
285  	return err;
286  }
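
/*
 * Worked example (editorial): on a 64-bit arch the bootstrap stack page
 * spans [STACK_TOP_MAX - PAGE_SIZE, STACK_TOP_MAX), so bprm->p starts at
 * STACK_TOP_MAX - sizeof(void *) and copy_strings() walks it downwards;
 * the vma is later relocated and resized by setup_arg_pages().
 */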
287  
288  static bool valid_arg_len(struct linux_binprm *bprm, long len)
289  {
290  	return len <= MAX_ARG_STRLEN;
291  }
292  
293  #else
294  
295  static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
296  {
297  }
298  
299  static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
300  		int write)
301  {
302  	struct page *page;
303  
304  	page = bprm->page[pos / PAGE_SIZE];
305  	if (!page && write) {
306  		page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
307  		if (!page)
308  			return NULL;
309  		bprm->page[pos / PAGE_SIZE] = page;
310  	}
311  
312  	return page;
313  }
314  
315  static void put_arg_page(struct page *page)
316  {
317  }
318  
319  static void free_arg_page(struct linux_binprm *bprm, int i)
320  {
321  	if (bprm->page[i]) {
322  		__free_page(bprm->page[i]);
323  		bprm->page[i] = NULL;
324  	}
325  }
326  
327  static void free_arg_pages(struct linux_binprm *bprm)
328  {
329  	int i;
330  
331  	for (i = 0; i < MAX_ARG_PAGES; i++)
332  		free_arg_page(bprm, i);
333  }
334  
335  static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
336  		struct page *page)
337  {
338  }
339  
340  static int __bprm_mm_init(struct linux_binprm *bprm)
341  {
342  	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
343  	return 0;
344  }
345  
346  static bool valid_arg_len(struct linux_binprm *bprm, long len)
347  {
348  	return len <= bprm->p;
349  }
350  
351  #endif /* CONFIG_MMU */
352  
353  /*
354   * Create a new mm_struct and populate it with a temporary stack
355   * vm_area_struct.  We don't have enough context at this point to set the stack
356   * flags, permissions, and offset, so we use temporary values.  We'll update
357   * them later in setup_arg_pages().
358   */
359  static int bprm_mm_init(struct linux_binprm *bprm)
360  {
361  	int err;
362  	struct mm_struct *mm = NULL;
363  
364  	bprm->mm = mm = mm_alloc();
365  	err = -ENOMEM;
366  	if (!mm)
367  		goto err;
368  
369  	/* Save current stack limit for all calculations made during exec. */
370  	task_lock(current->group_leader);
371  	bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
372  	task_unlock(current->group_leader);
373  
374  	err = __bprm_mm_init(bprm);
375  	if (err)
376  		goto err;
377  
378  	return 0;
379  
380  err:
381  	if (mm) {
382  		bprm->mm = NULL;
383  		mmdrop(mm);
384  	}
385  
386  	return err;
387  }
388  
389  struct user_arg_ptr {
390  #ifdef CONFIG_COMPAT
391  	bool is_compat;
392  #endif
393  	union {
394  		const char __user *const __user *native;
395  #ifdef CONFIG_COMPAT
396  		const compat_uptr_t __user *compat;
397  #endif
398  	} ptr;
399  };
400  
401  static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
402  {
403  	const char __user *native;
404  
405  #ifdef CONFIG_COMPAT
406  	if (unlikely(argv.is_compat)) {
407  		compat_uptr_t compat;
408  
409  		if (get_user(compat, argv.ptr.compat + nr))
410  			return ERR_PTR(-EFAULT);
411  
412  		return compat_ptr(compat);
413  	}
414  #endif
415  
416  	if (get_user(native, argv.ptr.native + nr))
417  		return ERR_PTR(-EFAULT);
418  
419  	return native;
420  }
421  
422  /*
423   * count() counts the number of strings in array ARGV.
424   */
425  static int count(struct user_arg_ptr argv, int max)
426  {
427  	int i = 0;
428  
429  	if (argv.ptr.native != NULL) {
430  		for (;;) {
431  			const char __user *p = get_user_arg_ptr(argv, i);
432  
433  			if (!p)
434  				break;
435  
436  			if (IS_ERR(p))
437  				return -EFAULT;
438  
439  			if (i >= max)
440  				return -E2BIG;
441  			++i;
442  
443  			if (fatal_signal_pending(current))
444  				return -ERESTARTNOHAND;
445  			cond_resched();
446  		}
447  	}
448  	return i;
449  }
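
/*
 * Editorial note: count() relies on the userspace execve() contract that
 * argv/envp are arrays of string pointers terminated by a NULL entry,
 * e.g. (userspace C):
 *
 *	char *argv[] = { "/bin/ls", "-l", NULL };
 *	char *envp[] = { "PATH=/bin", NULL };
 *	execve("/bin/ls", argv, envp);
 */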
450  
451  static int prepare_arg_pages(struct linux_binprm *bprm,
452  			struct user_arg_ptr argv, struct user_arg_ptr envp)
453  {
454  	unsigned long limit, ptr_size;
455  
456  	bprm->argc = count(argv, MAX_ARG_STRINGS);
457  	if (bprm->argc < 0)
458  		return bprm->argc;
459  
460  	bprm->envc = count(envp, MAX_ARG_STRINGS);
461  	if (bprm->envc < 0)
462  		return bprm->envc;
463  
464  	/*
465  	 * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
466  	 * (whichever is smaller) for the argv+env strings.
467  	 * This ensures that:
468  	 *  - the remaining binfmt code will not run out of stack space,
469  	 *  - the program will have a reasonable amount of stack left
470  	 *    to work from.
471  	 */
472  	limit = _STK_LIM / 4 * 3;
473  	limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
474  	/*
475  	 * We've historically supported up to 32 pages (ARG_MAX)
476  	 * of argument strings even with small stacks
477  	 */
478  	limit = max_t(unsigned long, limit, ARG_MAX);
479  	/*
480  	 * We must account for the size of all the argv and envp pointers to
481  	 * the argv and envp strings, since they will also take up space in
482  	 * the stack. They aren't stored until much later when we can't
483  	 * signal to the parent that the child has run out of stack space.
484  	 * Instead, calculate it here so it's possible to fail gracefully.
485  	 */
486  	ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
487  	if (limit <= ptr_size)
488  		return -E2BIG;
489  	limit -= ptr_size;
490  
491  	bprm->argmin = bprm->p - limit;
492  	return 0;
493  }
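
/*
 * Worked example (editorial): with the default RLIMIT_STACK of 8 MiB,
 * limit = min(3/4 * _STK_LIM, 8 MiB / 4) = min(6 MiB, 2 MiB) = 2 MiB,
 * which is already above ARG_MAX (128 KiB), so the argv+envp strings
 * plus their pointer array must fit in 2 MiB, and bprm->argmin lands
 * that far (minus ptr_size) below the initial bprm->p.
 */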
494  
495  /*
496   * 'copy_strings()' copies argument/environment strings from the old
497   * process's memory to the new process's stack.  The call to get_user_pages()
498   * ensures the destination page is created and not swapped out.
499   */
500  static int copy_strings(int argc, struct user_arg_ptr argv,
501  			struct linux_binprm *bprm)
502  {
503  	struct page *kmapped_page = NULL;
504  	char *kaddr = NULL;
505  	unsigned long kpos = 0;
506  	int ret;
507  
508  	while (argc-- > 0) {
509  		const char __user *str;
510  		int len;
511  		unsigned long pos;
512  
513  		ret = -EFAULT;
514  		str = get_user_arg_ptr(argv, argc);
515  		if (IS_ERR(str))
516  			goto out;
517  
518  		len = strnlen_user(str, MAX_ARG_STRLEN);
519  		if (!len)
520  			goto out;
521  
522  		ret = -E2BIG;
523  		if (!valid_arg_len(bprm, len))
524  			goto out;
525  
526  		/* We're going to work our way backwards. */
527  		pos = bprm->p;
528  		str += len;
529  		bprm->p -= len;
530  #ifdef CONFIG_MMU
531  		if (bprm->p < bprm->argmin)
532  			goto out;
533  #endif
534  
535  		while (len > 0) {
536  			int offset, bytes_to_copy;
537  
538  			if (fatal_signal_pending(current)) {
539  				ret = -ERESTARTNOHAND;
540  				goto out;
541  			}
542  			cond_resched();
543  
544  			offset = pos % PAGE_SIZE;
545  			if (offset == 0)
546  				offset = PAGE_SIZE;
547  
548  			bytes_to_copy = offset;
549  			if (bytes_to_copy > len)
550  				bytes_to_copy = len;
551  
552  			offset -= bytes_to_copy;
553  			pos -= bytes_to_copy;
554  			str -= bytes_to_copy;
555  			len -= bytes_to_copy;
556  
557  			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
558  				struct page *page;
559  
560  				page = get_arg_page(bprm, pos, 1);
561  				if (!page) {
562  					ret = -E2BIG;
563  					goto out;
564  				}
565  
566  				if (kmapped_page) {
567  					flush_kernel_dcache_page(kmapped_page);
568  					kunmap(kmapped_page);
569  					put_arg_page(kmapped_page);
570  				}
571  				kmapped_page = page;
572  				kaddr = kmap(kmapped_page);
573  				kpos = pos & PAGE_MASK;
574  				flush_arg_page(bprm, kpos, kmapped_page);
575  			}
576  			if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
577  				ret = -EFAULT;
578  				goto out;
579  			}
580  		}
581  	}
582  	ret = 0;
583  out:
584  	if (kmapped_page) {
585  		flush_kernel_dcache_page(kmapped_page);
586  		kunmap(kmapped_page);
587  		put_arg_page(kmapped_page);
588  	}
589  	return ret;
590  }
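
/*
 * Worked example (editorial, hypothetical offsets): for a string of
 * length 0x30 with bprm->p % PAGE_SIZE == 0xf0, offset starts at 0xf0,
 * bytes_to_copy = 0x30, and the whole string is copied to page offsets
 * 0xc0-0xef in one step; copying only splits when a string straddles a
 * page boundary, where offset momentarily becomes PAGE_SIZE.
 */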
591  
592  /*
593   * Like copy_strings, but get argv and its values from kernel memory.
594   */
595  int copy_strings_kernel(int argc, const char *const *__argv,
596  			struct linux_binprm *bprm)
597  {
598  	int r;
599  	mm_segment_t oldfs = get_fs();
600  	struct user_arg_ptr argv = {
601  		.ptr.native = (const char __user *const  __user *)__argv,
602  	};
603  
604  	set_fs(KERNEL_DS);
605  	r = copy_strings(argc, argv, bprm);
606  	set_fs(oldfs);
607  
608  	return r;
609  }
610  EXPORT_SYMBOL(copy_strings_kernel);
611  
612  #ifdef CONFIG_MMU
613  
614  /*
615   * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
616   * the binfmt code determines where the new stack should reside, we shift it to
617   * its final location.  The process proceeds as follows:
618   *
619   * 1) Use shift to calculate the new vma endpoints.
620   * 2) Extend vma to cover both the old and new ranges.  This ensures the
621   *    arguments passed to subsequent functions are consistent.
622   * 3) Move vma's page tables to the new range.
623   * 4) Free up any cleared pgd range.
624   * 5) Shrink the vma to cover only the new range.
625   */
626  static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
627  {
628  	struct mm_struct *mm = vma->vm_mm;
629  	unsigned long old_start = vma->vm_start;
630  	unsigned long old_end = vma->vm_end;
631  	unsigned long length = old_end - old_start;
632  	unsigned long new_start = old_start - shift;
633  	unsigned long new_end = old_end - shift;
634  	struct mmu_gather tlb;
635  
636  	BUG_ON(new_start > new_end);
637  
638  	/*
639  	 * ensure there are no vmas between where we want to go
640  	 * and where we are
641  	 */
642  	if (vma != find_vma(mm, new_start))
643  		return -EFAULT;
644  
645  	/*
646  	 * cover the whole range: [new_start, old_end)
647  	 */
648  	if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
649  		return -ENOMEM;
650  
651  	/*
652  	 * move the page tables downwards; on failure we rely on
653  	 * process cleanup to remove whatever mess we made.
654  	 */
655  	if (length != move_page_tables(vma, old_start,
656  				       vma, new_start, length, false))
657  		return -ENOMEM;
658  
659  	lru_add_drain();
660  	tlb_gather_mmu(&tlb, mm, old_start, old_end);
661  	if (new_end > old_start) {
662  		/*
663  		 * when the old and new regions overlap, clear from new_end.
664  		 */
665  		free_pgd_range(&tlb, new_end, old_end, new_end,
666  			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
667  	} else {
668  		/*
669  		 * otherwise, clean from old_start; this is done to avoid touching
670  		 * the address space in [new_end, old_start), since some architectures
671  		 * have constraints on va-space that make this illegal (IA64) -
672  		 * for the others it's just a little faster.
673  		 */
674  		free_pgd_range(&tlb, old_start, old_end, new_end,
675  			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
676  	}
677  	tlb_finish_mmu(&tlb, old_start, old_end);
678  
679  	/*
680  	 * Shrink the vma to just the new range.  Always succeeds.
681  	 */
682  	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
683  
684  	return 0;
685  }
686  
687  /*
688   * Finalizes the stack vm_area_struct. The flags and permissions are updated,
689   * the stack is optionally relocated, and some extra space is added.
690   */
691  int setup_arg_pages(struct linux_binprm *bprm,
692  		    unsigned long stack_top,
693  		    int executable_stack)
694  {
695  	unsigned long ret;
696  	unsigned long stack_shift;
697  	struct mm_struct *mm = current->mm;
698  	struct vm_area_struct *vma = bprm->vma;
699  	struct vm_area_struct *prev = NULL;
700  	unsigned long vm_flags;
701  	unsigned long stack_base;
702  	unsigned long stack_size;
703  	unsigned long stack_expand;
704  	unsigned long rlim_stack;
705  
706  #ifdef CONFIG_STACK_GROWSUP
707  	/* Limit stack size */
708  	stack_base = bprm->rlim_stack.rlim_max;
709  	if (stack_base > STACK_SIZE_MAX)
710  		stack_base = STACK_SIZE_MAX;
711  
712  	/* Add space for stack randomization. */
713  	stack_base += (STACK_RND_MASK << PAGE_SHIFT);
714  
715  	/* Make sure we didn't let the argument array grow too large. */
716  	if (vma->vm_end - vma->vm_start > stack_base)
717  		return -ENOMEM;
718  
719  	stack_base = PAGE_ALIGN(stack_top - stack_base);
720  
721  	stack_shift = vma->vm_start - stack_base;
722  	mm->arg_start = bprm->p - stack_shift;
723  	bprm->p = vma->vm_end - stack_shift;
724  #else
725  	stack_top = arch_align_stack(stack_top);
726  	stack_top = PAGE_ALIGN(stack_top);
727  
728  	if (unlikely(stack_top < mmap_min_addr) ||
729  	    unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
730  		return -ENOMEM;
731  
732  	stack_shift = vma->vm_end - stack_top;
733  
734  	bprm->p -= stack_shift;
735  	mm->arg_start = bprm->p;
736  #endif
737  
738  	if (bprm->loader)
739  		bprm->loader -= stack_shift;
740  	bprm->exec -= stack_shift;
741  
742  	if (down_write_killable(&mm->mmap_sem))
743  		return -EINTR;
744  
745  	vm_flags = VM_STACK_FLAGS;
746  
747  	/*
748  	 * Adjust stack execute permissions; explicitly enable for
749  	 * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
750  	 * (arch default) otherwise.
751  	 */
752  	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
753  		vm_flags |= VM_EXEC;
754  	else if (executable_stack == EXSTACK_DISABLE_X)
755  		vm_flags &= ~VM_EXEC;
756  	vm_flags |= mm->def_flags;
757  	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
758  
759  	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
760  			vm_flags);
761  	if (ret)
762  		goto out_unlock;
763  	BUG_ON(prev != vma);
764  
765  	/* Move stack pages down in memory. */
766  	if (stack_shift) {
767  		ret = shift_arg_pages(vma, stack_shift);
768  		if (ret)
769  			goto out_unlock;
770  	}
771  
772  	/* mprotect_fixup is overkill to remove the temporary stack flags */
773  	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
774  
775  	stack_expand = 131072UL; /* arbitrarily 32*4k (or 2*64k) pages */
776  	stack_size = vma->vm_end - vma->vm_start;
777  	/*
778  	 * Align this down to a page boundary as expand_stack
779  	 * will align it up.
780  	 */
781  	rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
782  #ifdef CONFIG_STACK_GROWSUP
783  	if (stack_size + stack_expand > rlim_stack)
784  		stack_base = vma->vm_start + rlim_stack;
785  	else
786  		stack_base = vma->vm_end + stack_expand;
787  #else
788  	if (stack_size + stack_expand > rlim_stack)
789  		stack_base = vma->vm_end - rlim_stack;
790  	else
791  		stack_base = vma->vm_start - stack_expand;
792  #endif
793  	current->mm->start_stack = bprm->p;
794  	ret = expand_stack(vma, stack_base);
795  	if (ret)
796  		ret = -EFAULT;
797  
798  out_unlock:
799  	up_write(&mm->mmap_sem);
800  	return ret;
801  }
802  EXPORT_SYMBOL(setup_arg_pages);
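
/*
 * Worked example (editorial, hypothetical addresses): if the temporary
 * stack vma ends at STACK_TOP_MAX = 0x7ffffffff000 and the binfmt picks
 * stack_top = 0x7fffff7ff000, then stack_shift = 0x800000 (8 MiB);
 * bprm->p and bprm->exec are moved down by that amount before
 * shift_arg_pages() relocates the pages and page tables themselves.
 */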
803  
804  #else
805  
806  /*
807   * Transfer the program arguments and environment from the holding pages
808   * onto the stack. The provided stack pointer is adjusted accordingly.
809   */
810  int transfer_args_to_stack(struct linux_binprm *bprm,
811  			   unsigned long *sp_location)
812  {
813  	unsigned long index, stop, sp;
814  	int ret = 0;
815  
816  	stop = bprm->p >> PAGE_SHIFT;
817  	sp = *sp_location;
818  
819  	for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
820  		unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
821  		char *src = kmap(bprm->page[index]) + offset;
822  		sp -= PAGE_SIZE - offset;
823  		if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
824  			ret = -EFAULT;
825  		kunmap(bprm->page[index]);
826  		if (ret)
827  			goto out;
828  	}
829  
830  	*sp_location = sp;
831  
832  out:
833  	return ret;
834  }
835  EXPORT_SYMBOL(transfer_args_to_stack);
836  
837  #endif /* CONFIG_MMU */
838  
839  static struct file *do_open_execat(int fd, struct filename *name, int flags)
840  {
841  	struct file *file;
842  	int err;
843  	struct open_flags open_exec_flags = {
844  		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
845  		.acc_mode = MAY_EXEC,
846  		.intent = LOOKUP_OPEN,
847  		.lookup_flags = LOOKUP_FOLLOW,
848  	};
849  
850  	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
851  		return ERR_PTR(-EINVAL);
852  	if (flags & AT_SYMLINK_NOFOLLOW)
853  		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
854  	if (flags & AT_EMPTY_PATH)
855  		open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
856  
857  	file = do_filp_open(fd, name, &open_exec_flags);
858  	if (IS_ERR(file))
859  		goto out;
860  
861  	err = -EACCES;
862  	if (!S_ISREG(file_inode(file)->i_mode))
863  		goto exit;
864  
865  	if (path_noexec(&file->f_path))
866  		goto exit;
867  
868  	err = deny_write_access(file);
869  	if (err)
870  		goto exit;
871  
872  	if (name->name[0] != '\0')
873  		fsnotify_open(file);
874  
875  out:
876  	return file;
877  
878  exit:
879  	fput(file);
880  	return ERR_PTR(err);
881  }
882  
883  struct file *open_exec(const char *name)
884  {
885  	struct filename *filename = getname_kernel(name);
886  	struct file *f = ERR_CAST(filename);
887  
888  	if (!IS_ERR(filename)) {
889  		f = do_open_execat(AT_FDCWD, filename, 0);
890  		putname(filename);
891  	}
892  	return f;
893  }
894  EXPORT_SYMBOL(open_exec);
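
/*
 * Editorial sketch: binfmt handlers use open_exec() to load helper
 * objects with full exec permission checks, e.g. the ELF interpreter
 * (cf. fs/binfmt_elf.c):
 *
 *	interpreter = open_exec(elf_interpreter);
 *	retval = PTR_ERR(interpreter);
 *	if (IS_ERR(interpreter))
 *		goto out_free_interp;
 */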
895  
896  int kernel_read_file(struct file *file, void **buf, loff_t *size,
897  		     loff_t max_size, enum kernel_read_file_id id)
898  {
899  	loff_t i_size, pos;
900  	ssize_t bytes = 0;
901  	int ret;
902  
903  	if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0)
904  		return -EINVAL;
905  
906  	ret = deny_write_access(file);
907  	if (ret)
908  		return ret;
909  
910  	ret = security_kernel_read_file(file, id);
911  	if (ret)
912  		goto out;
913  
914  	i_size = i_size_read(file_inode(file));
915  	if (i_size <= 0) {
916  		ret = -EINVAL;
917  		goto out;
918  	}
919  	if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) {
920  		ret = -EFBIG;
921  		goto out;
922  	}
923  
924  	if (id != READING_FIRMWARE_PREALLOC_BUFFER)
925  		*buf = vmalloc(i_size);
926  	if (!*buf) {
927  		ret = -ENOMEM;
928  		goto out;
929  	}
930  
931  	pos = 0;
932  	while (pos < i_size) {
933  		bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
934  		if (bytes < 0) {
935  			ret = bytes;
936  			goto out_free;
937  		}
938  
939  		if (bytes == 0)
940  			break;
941  	}
942  
943  	if (pos != i_size) {
944  		ret = -EIO;
945  		goto out_free;
946  	}
947  
948  	ret = security_kernel_post_read_file(file, *buf, i_size, id);
949  	if (!ret)
950  		*size = pos;
951  
952  out_free:
953  	if (ret < 0) {
954  		if (id != READING_FIRMWARE_PREALLOC_BUFFER) {
955  			vfree(*buf);
956  			*buf = NULL;
957  		}
958  	}
959  
960  out:
961  	allow_write_access(file);
962  	return ret;
963  }
964  EXPORT_SYMBOL_GPL(kernel_read_file);
965  
966  int kernel_read_file_from_path(const char *path, void **buf, loff_t *size,
967  			       loff_t max_size, enum kernel_read_file_id id)
968  {
969  	struct file *file;
970  	int ret;
971  
972  	if (!path || !*path)
973  		return -EINVAL;
974  
975  	file = filp_open(path, O_RDONLY, 0);
976  	if (IS_ERR(file))
977  		return PTR_ERR(file);
978  
979  	ret = kernel_read_file(file, buf, size, max_size, id);
980  	fput(file);
981  	return ret;
982  }
983  EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
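
/*
 * Editorial sketch of a typical caller (cf. the firmware loader); buf is
 * vmalloc()ed by kernel_read_file() and must be vfree()d by the caller.
 * The path, the size cap, and consume() are hypothetical:
 *
 *	void *buf = NULL;
 *	loff_t size;
 *	int ret = kernel_read_file_from_path("/lib/firmware/foo.bin",
 *					     &buf, &size, INT_MAX,
 *					     READING_FIRMWARE);
 *	if (!ret) {
 *		consume(buf, size);
 *		vfree(buf);
 *	}
 */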
984  
985  int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
986  			     enum kernel_read_file_id id)
987  {
988  	struct fd f = fdget(fd);
989  	int ret = -EBADF;
990  
991  	if (!f.file)
992  		goto out;
993  
994  	ret = kernel_read_file(f.file, buf, size, max_size, id);
995  out:
996  	fdput(f);
997  	return ret;
998  }
999  EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
1000  
1001  ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
1002  {
1003  	ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
1004  	if (res > 0)
1005  		flush_icache_range(addr, addr + len);
1006  	return res;
1007  }
1008  EXPORT_SYMBOL(read_code);
1009  
1010  static int exec_mmap(struct mm_struct *mm)
1011  {
1012  	struct task_struct *tsk;
1013  	struct mm_struct *old_mm, *active_mm;
1014  
1015  	/* Notify parent that we're no longer interested in the old VM */
1016  	tsk = current;
1017  	old_mm = current->mm;
1018  	mm_release(tsk, old_mm);
1019  
1020  	if (old_mm) {
1021  		sync_mm_rss(old_mm);
1022  		/*
1023  		 * Make sure that if there is a core dump in progress
1024  		 * for the old mm, we get out and die instead of going
1025  		 * through with the exec.  We must hold mmap_sem around
1026  		 * checking core_state and changing tsk->mm.
1027  		 */
1028  		down_read(&old_mm->mmap_sem);
1029  		if (unlikely(old_mm->core_state)) {
1030  			up_read(&old_mm->mmap_sem);
1031  			return -EINTR;
1032  		}
1033  	}
1034  	task_lock(tsk);
1035  	active_mm = tsk->active_mm;
1036  	tsk->mm = mm;
1037  	tsk->active_mm = mm;
1038  	activate_mm(active_mm, mm);
1039  	tsk->mm->vmacache_seqnum = 0;
1040  	vmacache_flush(tsk);
1041  	task_unlock(tsk);
1042  	if (old_mm) {
1043  		up_read(&old_mm->mmap_sem);
1044  		BUG_ON(active_mm != old_mm);
1045  		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
1046  		mm_update_next_owner(old_mm);
1047  		mmput(old_mm);
1048  		return 0;
1049  	}
1050  	mmdrop(active_mm);
1051  	return 0;
1052  }
1053  
1054  /*
1055   * This function makes sure the current process has its own signal table,
1056   * so that flush_signal_handlers can later reset the handlers without
1057   * disturbing other processes.  (Other processes might share the signal
1058   * table via the CLONE_SIGHAND option to clone().)
1059   */
1060  static int de_thread(struct task_struct *tsk)
1061  {
1062  	struct signal_struct *sig = tsk->signal;
1063  	struct sighand_struct *oldsighand = tsk->sighand;
1064  	spinlock_t *lock = &oldsighand->siglock;
1065  
1066  	if (thread_group_empty(tsk))
1067  		goto no_thread_group;
1068  
1069  	/*
1070  	 * Kill all other threads in the thread group.
1071  	 */
1072  	spin_lock_irq(lock);
1073  	if (signal_group_exit(sig)) {
1074  		/*
1075  		 * Another group action in progress, just
1076  		 * return so that the signal is processed.
1077  		 */
1078  		spin_unlock_irq(lock);
1079  		return -EAGAIN;
1080  	}
1081  
1082  	sig->group_exit_task = tsk;
1083  	sig->notify_count = zap_other_threads(tsk);
1084  	if (!thread_group_leader(tsk))
1085  		sig->notify_count--;
1086  
1087  	while (sig->notify_count) {
1088  		__set_current_state(TASK_KILLABLE);
1089  		spin_unlock_irq(lock);
1090  		schedule();
1091  		if (__fatal_signal_pending(tsk))
1092  			goto killed;
1093  		spin_lock_irq(lock);
1094  	}
1095  	spin_unlock_irq(lock);
1096  
1097  	/*
1098  	 * At this point all other threads have exited, all we have to
1099  	 * do is to wait for the thread group leader to become inactive,
1100  	 * and to assume its PID:
1101  	 */
1102  	if (!thread_group_leader(tsk)) {
1103  		struct task_struct *leader = tsk->group_leader;
1104  
1105  		for (;;) {
1106  			cgroup_threadgroup_change_begin(tsk);
1107  			write_lock_irq(&tasklist_lock);
1108  			/*
1109  			 * Do this under tasklist_lock to ensure that
1110  			 * exit_notify() can't miss ->group_exit_task
1111  			 */
1112  			sig->notify_count = -1;
1113  			if (likely(leader->exit_state))
1114  				break;
1115  			__set_current_state(TASK_KILLABLE);
1116  			write_unlock_irq(&tasklist_lock);
1117  			cgroup_threadgroup_change_end(tsk);
1118  			schedule();
1119  			if (__fatal_signal_pending(tsk))
1120  				goto killed;
1121  		}
1122  
1123  		/*
1124  		 * The only record we have of the real-time age of a
1125  		 * process, regardless of execs it's done, is start_time.
1126  		 * All the past CPU time is accumulated in signal_struct
1127  		 * from sister threads now dead.  But in this non-leader
1128  		 * exec, nothing survives from the original leader thread,
1129  		 * whose birth marks the true age of this process now.
1130  		 * When we take on its identity by switching to its PID, we
1131  		 * also take its birthdate (always earlier than our own).
1132  		 */
1133  		tsk->start_time = leader->start_time;
1134  		tsk->real_start_time = leader->real_start_time;
1135  
1136  		BUG_ON(!same_thread_group(leader, tsk));
1137  		BUG_ON(has_group_leader_pid(tsk));
1138  		/*
1139  		 * An exec() starts a new thread group with the
1140  		 * TGID of the previous thread group. Rehash the
1141  		 * two threads with a switched PID, and release
1142  		 * the former thread group leader:
1143  		 */
1144  
1145  		/* Become a process group leader with the old leader's pid.
1146  		 * The old leader becomes a thread of this thread group.
1147  		 * Note: The old leader also uses this pid until release_task
1148  		 *       is called.  Odd but simple and correct.
1149  		 */
1150  		tsk->pid = leader->pid;
1151  		change_pid(tsk, PIDTYPE_PID, task_pid(leader));
1152  		transfer_pid(leader, tsk, PIDTYPE_TGID);
1153  		transfer_pid(leader, tsk, PIDTYPE_PGID);
1154  		transfer_pid(leader, tsk, PIDTYPE_SID);
1155  
1156  		list_replace_rcu(&leader->tasks, &tsk->tasks);
1157  		list_replace_init(&leader->sibling, &tsk->sibling);
1158  
1159  		tsk->group_leader = tsk;
1160  		leader->group_leader = tsk;
1161  
1162  		tsk->exit_signal = SIGCHLD;
1163  		leader->exit_signal = -1;
1164  
1165  		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
1166  		leader->exit_state = EXIT_DEAD;
1167  
1168  		/*
1169  		 * We are going to release_task()->ptrace_unlink() silently;
1170  		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
1171  		 * the tracer won't block again waiting for this thread.
1172  		 */
1173  		if (unlikely(leader->ptrace))
1174  			__wake_up_parent(leader, leader->parent);
1175  		write_unlock_irq(&tasklist_lock);
1176  		cgroup_threadgroup_change_end(tsk);
1177  
1178  		release_task(leader);
1179  	}
1180  
1181  	sig->group_exit_task = NULL;
1182  	sig->notify_count = 0;
1183  
1184  no_thread_group:
1185  	/* we have changed execution domain */
1186  	tsk->exit_signal = SIGCHLD;
1187  
1188  #ifdef CONFIG_POSIX_TIMERS
1189  	exit_itimers(sig);
1190  	flush_itimer_signals();
1191  #endif
1192  
1193  	if (refcount_read(&oldsighand->count) != 1) {
1194  		struct sighand_struct *newsighand;
1195  		/*
1196  		 * This ->sighand is shared with a CLONE_SIGHAND but not
1197  		 * CLONE_THREAD task; switch to the new one.
1198  		 */
1199  		newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1200  		if (!newsighand)
1201  			return -ENOMEM;
1202  
1203  		refcount_set(&newsighand->count, 1);
1204  		memcpy(newsighand->action, oldsighand->action,
1205  		       sizeof(newsighand->action));
1206  
1207  		write_lock_irq(&tasklist_lock);
1208  		spin_lock(&oldsighand->siglock);
1209  		rcu_assign_pointer(tsk->sighand, newsighand);
1210  		spin_unlock(&oldsighand->siglock);
1211  		write_unlock_irq(&tasklist_lock);
1212  
1213  		__cleanup_sighand(oldsighand);
1214  	}
1215  
1216  	BUG_ON(!thread_group_leader(tsk));
1217  	return 0;
1218  
1219  killed:
1220  	/* protects against exit_notify() and __exit_signal() */
1221  	read_lock(&tasklist_lock);
1222  	sig->group_exit_task = NULL;
1223  	sig->notify_count = 0;
1224  	read_unlock(&tasklist_lock);
1225  	return -EAGAIN;
1226  }
1227  
1228  char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
1229  {
1230  	task_lock(tsk);
1231  	strncpy(buf, tsk->comm, buf_size);
1232  	task_unlock(tsk);
1233  	return buf;
1234  }
1235  EXPORT_SYMBOL_GPL(__get_task_comm);
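
/*
 * Editorial note: callers normally go through the get_task_comm() macro
 * from <linux/sched.h>, which checks the buffer size at compile time:
 *
 *	char comm[TASK_COMM_LEN];
 *	get_task_comm(comm, current);
 */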
1236  
1237  /*
1238   * These functions flush out all traces of the currently running executable
1239   * so that a new one can be started.
1240   */
1241  
1242  void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
1243  {
1244  	task_lock(tsk);
1245  	trace_task_rename(tsk, buf);
1246  	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
1247  	task_unlock(tsk);
1248  	perf_event_comm(tsk, exec);
1249  }
1250  
1251  /*
1252   * Calling this is the point of no return. None of the failures will be
1253   * seen by userspace since either the process is already taking a fatal
1254   * signal (via de_thread() or coredump), or will have SEGV raised
1255   * (after exec_mmap()) by search_binary_handler() (see below).
1256   */
1257  int flush_old_exec(struct linux_binprm * bprm)
1258  {
1259  	int retval;
1260  
1261  	/*
1262  	 * Make sure we have a private signal table and that
1263  	 * we are unassociated from the previous thread group.
1264  	 */
1265  	retval = de_thread(current);
1266  	if (retval)
1267  		goto out;
1268  
1269  	/*
1270  	 * Must be called _before_ exec_mmap() as bprm->mm is
1271  	 * not visible until then. This also enables the update
1272  	 * to be lockless.
1273  	 */
1274  	set_mm_exe_file(bprm->mm, bprm->file);
1275  
1276  	/*
1277  	 * Release all of the old mmap stuff
1278  	 */
1279  	acct_arg_size(bprm, 0);
1280  	retval = exec_mmap(bprm->mm);
1281  	if (retval)
1282  		goto out;
1283  
1284  	/*
1285  	 * After clearing bprm->mm (to mark that current is using the
1286  	 * prepared mm now), we have nothing left of the original
1287  	 * process. If anything from here on returns an error, the check
1288  	 * in search_binary_handler() will SEGV current.
1289  	 */
1290  	bprm->mm = NULL;
1291  
1292  	set_fs(USER_DS);
1293  	current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
1294  					PF_NOFREEZE | PF_NO_SETAFFINITY);
1295  	flush_thread();
1296  	current->personality &= ~bprm->per_clear;
1297  
1298  	/*
1299  	 * We have to apply CLOEXEC before we change whether the process is
1300  	 * dumpable (in setup_new_exec) to avoid a race with a process in userspace
1301  	 * trying to access the should-be-closed file descriptors of a process
1302  	 * undergoing exec(2).
1303  	 */
1304  	do_close_on_exec(current->files);
1305  	return 0;
1306  
1307  out:
1308  	return retval;
1309  }
1310  EXPORT_SYMBOL(flush_old_exec);
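
/*
 * Editorial sketch: the order a load_binary() implementation follows
 * around the point of no return (loosely after fs/binfmt_elf.c); error
 * handling elided:
 *
 *	retval = flush_old_exec(bprm);		// de_thread + new mm
 *	setup_new_exec(bprm);			// comm, dumpability, layout
 *	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
 *				 executable_stack);
 *	install_exec_creds(bprm);		// commit creds, drop mutex
 *	...
 *	finalize_exec(bprm);			// store final rlim_stack
 *	start_thread(regs, elf_entry, bprm->p);	// hand off to new image
 */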
1311  
1312  void would_dump(struct linux_binprm *bprm, struct file *file)
1313  {
1314  	struct inode *inode = file_inode(file);
1315  	if (inode_permission(inode, MAY_READ) < 0) {
1316  		struct user_namespace *old, *user_ns;
1317  		bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
1318  
1319  		/* Ensure mm->user_ns contains the executable */
1320  		user_ns = old = bprm->mm->user_ns;
1321  		while ((user_ns != &init_user_ns) &&
1322  		       !privileged_wrt_inode_uidgid(user_ns, inode))
1323  			user_ns = user_ns->parent;
1324  
1325  		if (old != user_ns) {
1326  			bprm->mm->user_ns = get_user_ns(user_ns);
1327  			put_user_ns(old);
1328  		}
1329  	}
1330  }
1331  EXPORT_SYMBOL(would_dump);
1332  
1333  void setup_new_exec(struct linux_binprm * bprm)
1334  {
1335  	/*
1336  	 * Once here, prepare_binprm() will not be called any more, so
1337  	 * the final state of setuid/setgid/fscaps can be merged into the
1338  	 * secureexec flag.
1339  	 */
1340  	bprm->secureexec |= bprm->cap_elevated;
1341  
1342  	if (bprm->secureexec) {
1343  		/* Make sure parent cannot signal privileged process. */
1344  		current->pdeath_signal = 0;
1345  
1346  		/*
1347  		 * For secureexec, reset the stack limit to sane default to
1348  		 * avoid bad behavior from the prior rlimits. This has to
1349  		 * happen before arch_pick_mmap_layout(), which examines
1350  		 * RLIMIT_STACK, but after the point of no return to avoid
1351  		 * needing to clean up the change on failure.
1352  		 */
1353  		if (bprm->rlim_stack.rlim_cur > _STK_LIM)
1354  			bprm->rlim_stack.rlim_cur = _STK_LIM;
1355  	}
1356  
1357  	arch_pick_mmap_layout(current->mm, &bprm->rlim_stack);
1358  
1359  	current->sas_ss_sp = current->sas_ss_size = 0;
1360  
1361  	/*
1362  	 * Figure out dumpability. Note that checking only current here
1363  	 * is wrong, but userspace depends on it. This should be testing
1364  	 * bprm->secureexec instead.
1365  	 */
1366  	if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
1367  	    !(uid_eq(current_euid(), current_uid()) &&
1368  	      gid_eq(current_egid(), current_gid())))
1369  		set_dumpable(current->mm, suid_dumpable);
1370  	else
1371  		set_dumpable(current->mm, SUID_DUMP_USER);
1372  
1373  	arch_setup_new_exec();
1374  	perf_event_exec();
1375  	__set_task_comm(current, kbasename(bprm->filename), true);
1376  
1377  	/* Set the new mm task size. We have to do that late because it may
1378  	 * depend on TIF_32BIT, which is only updated in flush_thread() on
1379  	 * some architectures like powerpc.
1380  	 */
1381  	current->mm->task_size = TASK_SIZE;
1382  
1383  	/* An exec changes our domain. We are no longer part of the thread
1384  	   group. */
1385  	current->self_exec_id++;
1386  	flush_signal_handlers(current, 0);
1387  }
1388  EXPORT_SYMBOL(setup_new_exec);
1389  
1390  /* Runs immediately before start_thread() takes over. */
1391  void finalize_exec(struct linux_binprm *bprm)
1392  {
1393  	/* Store any stack rlimit changes before starting thread. */
1394  	task_lock(current->group_leader);
1395  	current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
1396  	task_unlock(current->group_leader);
1397  }
1398  EXPORT_SYMBOL(finalize_exec);
1399  
1400  /*
1401   * Prepare credentials and lock ->cred_guard_mutex.
1402   * install_exec_creds() commits the new creds and drops the lock.
1403   * Or, if exec fails before that, free_bprm() should release ->cred
1404   * and unlock.
1405   */
1406  static int prepare_bprm_creds(struct linux_binprm *bprm)
1407  {
1408  	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
1409  		return -ERESTARTNOINTR;
1410  
1411  	bprm->cred = prepare_exec_creds();
1412  	if (likely(bprm->cred))
1413  		return 0;
1414  
1415  	mutex_unlock(&current->signal->cred_guard_mutex);
1416  	return -ENOMEM;
1417  }
1418  
1419  static void free_bprm(struct linux_binprm *bprm)
1420  {
1421  	free_arg_pages(bprm);
1422  	if (bprm->cred) {
1423  		mutex_unlock(&current->signal->cred_guard_mutex);
1424  		abort_creds(bprm->cred);
1425  	}
1426  	if (bprm->file) {
1427  		allow_write_access(bprm->file);
1428  		fput(bprm->file);
1429  	}
1430  	/* If a binfmt changed the interp, free it. */
1431  	if (bprm->interp != bprm->filename)
1432  		kfree(bprm->interp);
1433  	kfree(bprm);
1434  }
1435  
1436  int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
1437  {
1438  	/* If a binfmt changed the interp, free it first. */
1439  	if (bprm->interp != bprm->filename)
1440  		kfree(bprm->interp);
1441  	bprm->interp = kstrdup(interp, GFP_KERNEL);
1442  	if (!bprm->interp)
1443  		return -ENOMEM;
1444  	return 0;
1445  }
1446  EXPORT_SYMBOL(bprm_change_interp);
1447  
1448  /*
1449   * install the new credentials for this executable
1450   */
1451  void install_exec_creds(struct linux_binprm *bprm)
1452  {
1453  	security_bprm_committing_creds(bprm);
1454  
1455  	commit_creds(bprm->cred);
1456  	bprm->cred = NULL;
1457  
1458  	/*
1459  	 * Disable monitoring for regular users
1460  	 * when executing setuid binaries. Must
1461  	 * wait until new credentials are committed
1462  	 * by commit_creds() above
1463  	 */
1464  	if (get_dumpable(current->mm) != SUID_DUMP_USER)
1465  		perf_event_exit_task(current);
1466  	/*
1467  	 * cred_guard_mutex must be held at least to this point to prevent
1468  	 * ptrace_attach() from altering our determination of the task's
1469  	 * credentials; any time after this it may be unlocked.
1470  	 */
1471  	security_bprm_committed_creds(bprm);
1472  	mutex_unlock(&current->signal->cred_guard_mutex);
1473  }
1474  EXPORT_SYMBOL(install_exec_creds);
1475  
1476  /*
1477   * determine how safe it is to execute the proposed program
1478   * - the caller must hold ->cred_guard_mutex to protect against
1479   *   PTRACE_ATTACH or seccomp thread-sync
1480   */
1481  static void check_unsafe_exec(struct linux_binprm *bprm)
1482  {
1483  	struct task_struct *p = current, *t;
1484  	unsigned n_fs;
1485  
1486  	if (p->ptrace)
1487  		bprm->unsafe |= LSM_UNSAFE_PTRACE;
1488  
1489  	/*
1490  	 * This isn't strictly necessary, but it makes it harder for LSMs to
1491  	 * mess up.
1492  	 */
1493  	if (task_no_new_privs(current))
1494  		bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
1495  
1496  	t = p;
1497  	n_fs = 1;
1498  	spin_lock(&p->fs->lock);
1499  	rcu_read_lock();
1500  	while_each_thread(p, t) {
1501  		if (t->fs == p->fs)
1502  			n_fs++;
1503  	}
1504  	rcu_read_unlock();
1505  
1506  	if (p->fs->users > n_fs)
1507  		bprm->unsafe |= LSM_UNSAFE_SHARE;
1508  	else
1509  		p->fs->in_exec = 1;
1510  	spin_unlock(&p->fs->lock);
1511  }
1512  
1513  static void bprm_fill_uid(struct linux_binprm *bprm)
1514  {
1515  	struct inode *inode;
1516  	unsigned int mode;
1517  	kuid_t uid;
1518  	kgid_t gid;
1519  
1520  	/*
1521  	 * Since this can be called multiple times (via prepare_binprm),
1522  	 * we must clear any previous work done when setting set[ug]id
1523  	 * bits from any earlier bprm->file uses (for example when run
1524  	 * first for a setuid script then again for its interpreter).
1525  	 */
1526  	bprm->cred->euid = current_euid();
1527  	bprm->cred->egid = current_egid();
1528  
1529  	if (!mnt_may_suid(bprm->file->f_path.mnt))
1530  		return;
1531  
1532  	if (task_no_new_privs(current))
1533  		return;
1534  
1535  	inode = bprm->file->f_path.dentry->d_inode;
1536  	mode = READ_ONCE(inode->i_mode);
1537  	if (!(mode & (S_ISUID|S_ISGID)))
1538  		return;
1539  
1540  	/* Be careful if suid/sgid is set */
1541  	inode_lock(inode);
1542  
1543  	/* Reload mode/uid/gid atomically now that the lock is held */
1544  	mode = inode->i_mode;
1545  	uid = inode->i_uid;
1546  	gid = inode->i_gid;
1547  	inode_unlock(inode);
1548  
1549  	/* We ignore suid/sgid if there are no mappings for them in the ns */
1550  	if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
1551  		 !kgid_has_mapping(bprm->cred->user_ns, gid))
1552  		return;
1553  
1554  	if (mode & S_ISUID) {
1555  		bprm->per_clear |= PER_CLEAR_ON_SETID;
1556  		bprm->cred->euid = uid;
1557  	}
1558  
1559  	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1560  		bprm->per_clear |= PER_CLEAR_ON_SETID;
1561  		bprm->cred->egid = gid;
1562  	}
1563  }
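
/*
 * Worked example (editorial): for a root-owned binary with mode 04755,
 * the S_ISUID branch above sets bprm->cred->euid to root while the real
 * uid is untouched.  The S_ISGID case additionally requires S_IXGRP
 * because setgid-without-group-exec historically marks mandatory
 * locking, not a setgid executable.
 */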
1564  
1565  /*
1566   * Fill the binprm structure from the inode.
1567   * Check permissions, then read the first BINPRM_BUF_SIZE bytes.
1568   *
1569   * This may be called multiple times for binary chains (scripts for example).
1570   */
1571  int prepare_binprm(struct linux_binprm *bprm)
1572  {
1573  	int retval;
1574  	loff_t pos = 0;
1575  
1576  	bprm_fill_uid(bprm);
1577  
1578  	/* fill in binprm security blob */
1579  	retval = security_bprm_set_creds(bprm);
1580  	if (retval)
1581  		return retval;
1582  	bprm->called_set_creds = 1;
1583  
1584  	memset(bprm->buf, 0, BINPRM_BUF_SIZE);
1585  	return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
1586  }
1587  
1588  EXPORT_SYMBOL(prepare_binprm);
1589  
1590  /*
1591   * Arguments are '\0' separated strings found at the location bprm->p
1592   * points to; chop off the first by relocating bprm->p to right after
1593   * the first '\0' encountered.
1594   */
1595  int remove_arg_zero(struct linux_binprm *bprm)
1596  {
1597  	int ret = 0;
1598  	unsigned long offset;
1599  	char *kaddr;
1600  	struct page *page;
1601  
1602  	if (!bprm->argc)
1603  		return 0;
1604  
1605  	do {
1606  		offset = bprm->p & ~PAGE_MASK;
1607  		page = get_arg_page(bprm, bprm->p, 0);
1608  		if (!page) {
1609  			ret = -EFAULT;
1610  			goto out;
1611  		}
1612  		kaddr = kmap_atomic(page);
1613  
1614  		for (; offset < PAGE_SIZE && kaddr[offset];
1615  				offset++, bprm->p++)
1616  			;
1617  
1618  		kunmap_atomic(kaddr);
1619  		put_arg_page(page);
1620  	} while (offset == PAGE_SIZE);
1621  
1622  	bprm->p++;
1623  	bprm->argc--;
1624  	ret = 0;
1625  
1626  out:
1627  	return ret;
1628  }
1629  EXPORT_SYMBOL(remove_arg_zero);
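
/*
 * Editorial sketch of the classic consumer, loosely following
 * fs/binfmt_script.c: drop the script's own argv[0], push replacement
 * strings back-to-front, switch the interpreter and retry.  i_name here
 * stands for the interpreter path parsed from the #! line; error
 * handling elided:
 *
 *	retval = remove_arg_zero(bprm);
 *	retval = copy_strings_kernel(1, &bprm->interp, bprm);	// script path
 *	bprm->argc++;
 *	retval = bprm_change_interp(i_name, bprm);		// e.g. "/bin/sh"
 *	retval = prepare_binprm(bprm);
 *	return search_binary_handler(bprm);
 */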
1630  
1631  #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1632  /*
1633   * Cycle through the list of binary format handlers until one recognizes the image.
1634   */
1635  int search_binary_handler(struct linux_binprm *bprm)
1636  {
1637  	bool need_retry = IS_ENABLED(CONFIG_MODULES);
1638  	struct linux_binfmt *fmt;
1639  	int retval;
1640  
1641  	/* This allows 4 levels of binfmt rewrites before failing hard. */
1642  	if (bprm->recursion_depth > 5)
1643  		return -ELOOP;
1644  
1645  	retval = security_bprm_check(bprm);
1646  	if (retval)
1647  		return retval;
1648  
1649  	retval = -ENOENT;
1650   retry:
1651  	read_lock(&binfmt_lock);
1652  	list_for_each_entry(fmt, &formats, lh) {
1653  		if (!try_module_get(fmt->module))
1654  			continue;
1655  		read_unlock(&binfmt_lock);
1656  
1657  		bprm->recursion_depth++;
1658  		retval = fmt->load_binary(bprm);
1659  		bprm->recursion_depth--;
1660  
1661  		read_lock(&binfmt_lock);
1662  		put_binfmt(fmt);
1663  		if (retval < 0 && !bprm->mm) {
1664  			/* we got to flush_old_exec() and failed after it */
1665  			read_unlock(&binfmt_lock);
1666  			force_sigsegv(SIGSEGV, current);
1667  			return retval;
1668  		}
1669  		if (retval != -ENOEXEC || !bprm->file) {
1670  			read_unlock(&binfmt_lock);
1671  			return retval;
1672  		}
1673  	}
1674  	read_unlock(&binfmt_lock);
1675  
1676  	if (need_retry) {
1677  		if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
1678  		    printable(bprm->buf[2]) && printable(bprm->buf[3]))
1679  			return retval;
1680  		if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
1681  			return retval;
1682  		need_retry = false;
1683  		goto retry;
1684  	}
1685  
1686  	return retval;
1687  }
1688  EXPORT_SYMBOL(search_binary_handler);
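
/*
 * Editorial note: the request_module("binfmt-%04x", ...) fallback above
 * keys on image bytes 2 and 3 (little-endian); e.g. "\177ELF" yields
 * 0x464c, so an out-of-tree ELF-like handler could make itself
 * demand-loadable with a hypothetical:
 *
 *	MODULE_ALIAS("binfmt-464c");
 */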
1689  
1690  static int exec_binprm(struct linux_binprm *bprm)
1691  {
1692  	pid_t old_pid, old_vpid;
1693  	int ret;
1694  
1695  	/* Need to fetch pid before load_binary changes it */
1696  	old_pid = current->pid;
1697  	rcu_read_lock();
1698  	old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
1699  	rcu_read_unlock();
1700  
1701  	ret = search_binary_handler(bprm);
1702  	if (ret >= 0) {
1703  		audit_bprm(bprm);
1704  		trace_sched_process_exec(current, old_pid, bprm);
1705  		ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1706  		proc_exec_connector(current);
1707  	}
1708  
1709  	return ret;
1710  }
1711  
1712  /*
1713   * sys_execve() executes a new program.
1714   */
1715  static int __do_execve_file(int fd, struct filename *filename,
1716  			    struct user_arg_ptr argv,
1717  			    struct user_arg_ptr envp,
1718  			    int flags, struct file *file)
1719  {
1720  	char *pathbuf = NULL;
1721  	struct linux_binprm *bprm;
1722  	struct files_struct *displaced;
1723  	int retval;
1724  
1725  	if (IS_ERR(filename))
1726  		return PTR_ERR(filename);
1727  
1728  	/*
1729  	 * We move the actual failure in case of RLIMIT_NPROC excess from
1730  	 * set*uid() to execve() because too many poorly written programs
1731  	 * don't check setuid() return code.  Here we additionally recheck
1732  	 * whether NPROC limit is still exceeded.
1733  	 */
1734  	if ((current->flags & PF_NPROC_EXCEEDED) &&
1735  	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
1736  		retval = -EAGAIN;
1737  		goto out_ret;
1738  	}
1739  
1740  	/* We're below the limit (still or again), so we don't want to make
1741  	 * further execve() calls fail. */
1742  	current->flags &= ~PF_NPROC_EXCEEDED;
1743  
1744  	retval = unshare_files(&displaced);
1745  	if (retval)
1746  		goto out_ret;
1747  
1748  	retval = -ENOMEM;
1749  	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1750  	if (!bprm)
1751  		goto out_files;
1752  
1753  	retval = prepare_bprm_creds(bprm);
1754  	if (retval)
1755  		goto out_free;
1756  
1757  	check_unsafe_exec(bprm);
1758  	current->in_execve = 1;
1759  
1760  	if (!file)
1761  		file = do_open_execat(fd, filename, flags);
1762  	retval = PTR_ERR(file);
1763  	if (IS_ERR(file))
1764  		goto out_unmark;
1765  
1766  	sched_exec();
1767  
1768  	bprm->file = file;
1769  	if (!filename) {
1770  		bprm->filename = "none";
1771  	} else if (fd == AT_FDCWD || filename->name[0] == '/') {
1772  		bprm->filename = filename->name;
1773  	} else {
1774  		if (filename->name[0] == '\0')
1775  			pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
1776  		else
1777  			pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
1778  					    fd, filename->name);
1779  		if (!pathbuf) {
1780  			retval = -ENOMEM;
1781  			goto out_unmark;
1782  		}
1783  		/*
1784  		 * Record that a name derived from an O_CLOEXEC fd will be
1785  		 * inaccessible after exec. Relies on having exclusive access to
1786  		 * current->files (due to unshare_files above).
1787  		 */
1788  		if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
1789  			bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
1790  		bprm->filename = pathbuf;
1791  	}
1792  	bprm->interp = bprm->filename;
1793  
1794  	retval = bprm_mm_init(bprm);
1795  	if (retval)
1796  		goto out_unmark;
1797  
1798  	retval = prepare_arg_pages(bprm, argv, envp);
1799  	if (retval < 0)
1800  		goto out;
1801  
1802  	retval = prepare_binprm(bprm);
1803  	if (retval < 0)
1804  		goto out;
1805  
1806  	retval = copy_strings_kernel(1, &bprm->filename, bprm);
1807  	if (retval < 0)
1808  		goto out;
1809  
1810  	bprm->exec = bprm->p;
1811  	retval = copy_strings(bprm->envc, envp, bprm);
1812  	if (retval < 0)
1813  		goto out;
1814  
1815  	retval = copy_strings(bprm->argc, argv, bprm);
1816  	if (retval < 0)
1817  		goto out;
1818  
1819  	would_dump(bprm, bprm->file);
1820  
1821  	retval = exec_binprm(bprm);
1822  	if (retval < 0)
1823  		goto out;
1824  
1825  	/* execve succeeded */
1826  	current->fs->in_exec = 0;
1827  	current->in_execve = 0;
1828  	membarrier_execve(current);
1829  	rseq_execve(current);
1830  	acct_update_integrals(current);
1831  	task_numa_free(current);
1832  	free_bprm(bprm);
1833  	kfree(pathbuf);
1834  	if (filename)
1835  		putname(filename);
1836  	if (displaced)
1837  		put_files_struct(displaced);
1838  	return retval;
1839  
1840  out:
1841  	if (bprm->mm) {
1842  		acct_arg_size(bprm, 0);
1843  		mmput(bprm->mm);
1844  	}
1845  
1846  out_unmark:
1847  	current->fs->in_exec = 0;
1848  	current->in_execve = 0;
1849  
1850  out_free:
1851  	free_bprm(bprm);
1852  	kfree(pathbuf);
1853  
1854  out_files:
1855  	if (displaced)
1856  		reset_files_struct(displaced);
1857  out_ret:
1858  	if (filename)
1859  		putname(filename);
1860  	return retval;
1861  }
1862  
1863  static int do_execveat_common(int fd, struct filename *filename,
1864  			      struct user_arg_ptr argv,
1865  			      struct user_arg_ptr envp,
1866  			      int flags)
1867  {
1868  	return __do_execve_file(fd, filename, argv, envp, flags, NULL);
1869  }
1870  
1871  int do_execve_file(struct file *file, void *__argv, void *__envp)
1872  {
1873  	struct user_arg_ptr argv = { .ptr.native = __argv };
1874  	struct user_arg_ptr envp = { .ptr.native = __envp };
1875  
1876  	return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file);
1877  }
1878  
1879  int do_execve(struct filename *filename,
1880  	const char __user *const __user *__argv,
1881  	const char __user *const __user *__envp)
1882  {
1883  	struct user_arg_ptr argv = { .ptr.native = __argv };
1884  	struct user_arg_ptr envp = { .ptr.native = __envp };
1885  	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
1886  }
1887  
1888  int do_execveat(int fd, struct filename *filename,
1889  		const char __user *const __user *__argv,
1890  		const char __user *const __user *__envp,
1891  		int flags)
1892  {
1893  	struct user_arg_ptr argv = { .ptr.native = __argv };
1894  	struct user_arg_ptr envp = { .ptr.native = __envp };
1895  
1896  	return do_execveat_common(fd, filename, argv, envp, flags);
1897  }
1898  
1899  #ifdef CONFIG_COMPAT
1900  static int compat_do_execve(struct filename *filename,
1901  	const compat_uptr_t __user *__argv,
1902  	const compat_uptr_t __user *__envp)
1903  {
1904  	struct user_arg_ptr argv = {
1905  		.is_compat = true,
1906  		.ptr.compat = __argv,
1907  	};
1908  	struct user_arg_ptr envp = {
1909  		.is_compat = true,
1910  		.ptr.compat = __envp,
1911  	};
1912  	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
1913  }
1914  
1915  static int compat_do_execveat(int fd, struct filename *filename,
1916  			      const compat_uptr_t __user *__argv,
1917  			      const compat_uptr_t __user *__envp,
1918  			      int flags)
1919  {
1920  	struct user_arg_ptr argv = {
1921  		.is_compat = true,
1922  		.ptr.compat = __argv,
1923  	};
1924  	struct user_arg_ptr envp = {
1925  		.is_compat = true,
1926  		.ptr.compat = __envp,
1927  	};
1928  	return do_execveat_common(fd, filename, argv, envp, flags);
1929  }
1930  #endif
1931  
1932  void set_binfmt(struct linux_binfmt *new)
1933  {
1934  	struct mm_struct *mm = current->mm;
1935  
1936  	if (mm->binfmt)
1937  		module_put(mm->binfmt->module);
1938  
1939  	mm->binfmt = new;
1940  	if (new)
1941  		__module_get(new->module);
1942  }
1943  EXPORT_SYMBOL(set_binfmt);
1944  
1945  /*
1946   * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
1947   */
1948  void set_dumpable(struct mm_struct *mm, int value)
1949  {
1950  	if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
1951  		return;
1952  
1953  	set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
1954  }
1955  
1956  SYSCALL_DEFINE3(execve,
1957  		const char __user *, filename,
1958  		const char __user *const __user *, argv,
1959  		const char __user *const __user *, envp)
1960  {
1961  	return do_execve(getname(filename), argv, envp);
1962  }
1963  
1964  SYSCALL_DEFINE5(execveat,
1965  		int, fd, const char __user *, filename,
1966  		const char __user *const __user *, argv,
1967  		const char __user *const __user *, envp,
1968  		int, flags)
1969  {
1970  	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
1971  
1972  	return do_execveat(fd,
1973  			   getname_flags(filename, lookup_flags, NULL),
1974  			   argv, envp, flags);
1975  }
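
/*
 * Editorial note (userspace): glibc builds fexecve() on this syscall
 * with an empty path and AT_EMPTY_PATH:
 *
 *	syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
 */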
1976  
1977  #ifdef CONFIG_COMPAT
1978  COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
1979  	const compat_uptr_t __user *, argv,
1980  	const compat_uptr_t __user *, envp)
1981  {
1982  	return compat_do_execve(getname(filename), argv, envp);
1983  }
1984  
1985  COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
1986  		       const char __user *, filename,
1987  		       const compat_uptr_t __user *, argv,
1988  		       const compat_uptr_t __user *, envp,
1989  		       int, flags)
1990  {
1991  	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
1992  
1993  	return compat_do_execveat(fd,
1994  				  getname_flags(filename, lookup_flags, NULL),
1995  				  argv, envp, flags);
1996  }
1997  #endif
1998