xref: /openbmc/linux/fs/coredump.c (revision 84158b7f)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/slab.h>
3 #include <linux/file.h>
4 #include <linux/fdtable.h>
5 #include <linux/freezer.h>
6 #include <linux/mm.h>
7 #include <linux/stat.h>
8 #include <linux/fcntl.h>
9 #include <linux/swap.h>
10 #include <linux/ctype.h>
11 #include <linux/string.h>
12 #include <linux/init.h>
13 #include <linux/pagemap.h>
14 #include <linux/perf_event.h>
15 #include <linux/highmem.h>
16 #include <linux/spinlock.h>
17 #include <linux/key.h>
18 #include <linux/personality.h>
19 #include <linux/binfmts.h>
20 #include <linux/coredump.h>
21 #include <linux/sched/coredump.h>
22 #include <linux/sched/signal.h>
23 #include <linux/sched/task_stack.h>
24 #include <linux/utsname.h>
25 #include <linux/pid_namespace.h>
26 #include <linux/module.h>
27 #include <linux/namei.h>
28 #include <linux/mount.h>
29 #include <linux/security.h>
30 #include <linux/syscalls.h>
31 #include <linux/tsacct_kern.h>
32 #include <linux/cn_proc.h>
33 #include <linux/audit.h>
34 #include <linux/tracehook.h>
35 #include <linux/kmod.h>
36 #include <linux/fsnotify.h>
37 #include <linux/fs_struct.h>
38 #include <linux/pipe_fs_i.h>
39 #include <linux/oom.h>
40 #include <linux/compat.h>
41 #include <linux/fs.h>
42 #include <linux/path.h>
43 #include <linux/timekeeping.h>
44 #include <linux/sysctl.h>
45 #include <linux/elf.h>
46 
47 #include <linux/uaccess.h>
48 #include <asm/mmu_context.h>
49 #include <asm/tlb.h>
50 #include <asm/exec.h>
51 
52 #include <trace/events/task.h>
53 #include "internal.h"
54 
55 #include <trace/events/sched.h>
56 
57 static int core_uses_pid;
58 static unsigned int core_pipe_limit;
59 static char core_pattern[CORENAME_MAX_SIZE] = "core";
60 static int core_name_size = CORENAME_MAX_SIZE;
61 
/* Growable buffer that the expanded core_pattern is written into. */
struct core_name {
	char *corename;	/* buffer obtained from krealloc() in expand_corename() */
	int used;	/* number of bytes of corename already filled in */
	int size;	/* capacity of corename as reported by ksize() */
};
66 
67 static int expand_corename(struct core_name *cn, int size)
68 {
69 	char *corename = krealloc(cn->corename, size, GFP_KERNEL);
70 
71 	if (!corename)
72 		return -ENOMEM;
73 
74 	if (size > core_name_size) /* racy but harmless */
75 		core_name_size = size;
76 
77 	cn->size = ksize(corename);
78 	cn->corename = corename;
79 	return 0;
80 }
81 
82 static __printf(2, 0) int cn_vprintf(struct core_name *cn, const char *fmt,
83 				     va_list arg)
84 {
85 	int free, need;
86 	va_list arg_copy;
87 
88 again:
89 	free = cn->size - cn->used;
90 
91 	va_copy(arg_copy, arg);
92 	need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy);
93 	va_end(arg_copy);
94 
95 	if (need < free) {
96 		cn->used += need;
97 		return 0;
98 	}
99 
100 	if (!expand_corename(cn, cn->size + need - free + 1))
101 		goto again;
102 
103 	return -ENOMEM;
104 }
105 
106 static __printf(2, 3) int cn_printf(struct core_name *cn, const char *fmt, ...)
107 {
108 	va_list arg;
109 	int ret;
110 
111 	va_start(arg, fmt);
112 	ret = cn_vprintf(cn, fmt, arg);
113 	va_end(arg);
114 
115 	return ret;
116 }
117 
118 static __printf(2, 3)
119 int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
120 {
121 	int cur = cn->used;
122 	va_list arg;
123 	int ret;
124 
125 	va_start(arg, fmt);
126 	ret = cn_vprintf(cn, fmt, arg);
127 	va_end(arg);
128 
129 	if (ret == 0) {
130 		/*
131 		 * Ensure that this coredump name component can't cause the
132 		 * resulting corefile path to consist of a ".." or ".".
133 		 */
134 		if ((cn->used - cur == 1 && cn->corename[cur] == '.') ||
135 				(cn->used - cur == 2 && cn->corename[cur] == '.'
136 				&& cn->corename[cur+1] == '.'))
137 			cn->corename[cur] = '!';
138 
139 		/*
140 		 * Empty names are fishy and could be used to create a "//" in a
141 		 * corefile name, causing the coredump to happen one directory
142 		 * level too high. Enforce that all components of the core
143 		 * pattern are at least one character long.
144 		 */
145 		if (cn->used == cur)
146 			ret = cn_printf(cn, "!");
147 	}
148 
149 	for (; cur < cn->used; ++cur) {
150 		if (cn->corename[cur] == '/')
151 			cn->corename[cur] = '!';
152 	}
153 	return ret;
154 }
155 
156 static int cn_print_exe_file(struct core_name *cn, bool name_only)
157 {
158 	struct file *exe_file;
159 	char *pathbuf, *path, *ptr;
160 	int ret;
161 
162 	exe_file = get_mm_exe_file(current->mm);
163 	if (!exe_file)
164 		return cn_esc_printf(cn, "%s (path unknown)", current->comm);
165 
166 	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
167 	if (!pathbuf) {
168 		ret = -ENOMEM;
169 		goto put_exe_file;
170 	}
171 
172 	path = file_path(exe_file, pathbuf, PATH_MAX);
173 	if (IS_ERR(path)) {
174 		ret = PTR_ERR(path);
175 		goto free_buf;
176 	}
177 
178 	if (name_only) {
179 		ptr = strrchr(path, '/');
180 		if (ptr)
181 			path = ptr + 1;
182 	}
183 	ret = cn_esc_printf(cn, "%s", path);
184 
185 free_buf:
186 	kfree(pathbuf);
187 put_exe_file:
188 	fput(exe_file);
189 	return ret;
190 }
191 
192 /* format_corename will inspect the pattern parameter, and output a
193  * name into corename, which must have space for at least
194  * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
195  */
196 static int format_corename(struct core_name *cn, struct coredump_params *cprm,
197 			   size_t **argv, int *argc)
198 {
199 	const struct cred *cred = current_cred();
200 	const char *pat_ptr = core_pattern;
201 	int ispipe = (*pat_ptr == '|');
202 	bool was_space = false;
203 	int pid_in_pattern = 0;
204 	int err = 0;
205 
206 	cn->used = 0;
207 	cn->corename = NULL;
208 	if (expand_corename(cn, core_name_size))
209 		return -ENOMEM;
210 	cn->corename[0] = '\0';
211 
212 	if (ispipe) {
213 		int argvs = sizeof(core_pattern) / 2;
214 		(*argv) = kmalloc_array(argvs, sizeof(**argv), GFP_KERNEL);
215 		if (!(*argv))
216 			return -ENOMEM;
217 		(*argv)[(*argc)++] = 0;
218 		++pat_ptr;
219 		if (!(*pat_ptr))
220 			return -ENOMEM;
221 	}
222 
223 	/* Repeat as long as we have more pattern to process and more output
224 	   space */
225 	while (*pat_ptr) {
226 		/*
227 		 * Split on spaces before doing template expansion so that
228 		 * %e and %E don't get split if they have spaces in them
229 		 */
230 		if (ispipe) {
231 			if (isspace(*pat_ptr)) {
232 				if (cn->used != 0)
233 					was_space = true;
234 				pat_ptr++;
235 				continue;
236 			} else if (was_space) {
237 				was_space = false;
238 				err = cn_printf(cn, "%c", '\0');
239 				if (err)
240 					return err;
241 				(*argv)[(*argc)++] = cn->used;
242 			}
243 		}
244 		if (*pat_ptr != '%') {
245 			err = cn_printf(cn, "%c", *pat_ptr++);
246 		} else {
247 			switch (*++pat_ptr) {
248 			/* single % at the end, drop that */
249 			case 0:
250 				goto out;
251 			/* Double percent, output one percent */
252 			case '%':
253 				err = cn_printf(cn, "%c", '%');
254 				break;
255 			/* pid */
256 			case 'p':
257 				pid_in_pattern = 1;
258 				err = cn_printf(cn, "%d",
259 					      task_tgid_vnr(current));
260 				break;
261 			/* global pid */
262 			case 'P':
263 				err = cn_printf(cn, "%d",
264 					      task_tgid_nr(current));
265 				break;
266 			case 'i':
267 				err = cn_printf(cn, "%d",
268 					      task_pid_vnr(current));
269 				break;
270 			case 'I':
271 				err = cn_printf(cn, "%d",
272 					      task_pid_nr(current));
273 				break;
274 			/* uid */
275 			case 'u':
276 				err = cn_printf(cn, "%u",
277 						from_kuid(&init_user_ns,
278 							  cred->uid));
279 				break;
280 			/* gid */
281 			case 'g':
282 				err = cn_printf(cn, "%u",
283 						from_kgid(&init_user_ns,
284 							  cred->gid));
285 				break;
286 			case 'd':
287 				err = cn_printf(cn, "%d",
288 					__get_dumpable(cprm->mm_flags));
289 				break;
290 			/* signal that caused the coredump */
291 			case 's':
292 				err = cn_printf(cn, "%d",
293 						cprm->siginfo->si_signo);
294 				break;
295 			/* UNIX time of coredump */
296 			case 't': {
297 				time64_t time;
298 
299 				time = ktime_get_real_seconds();
300 				err = cn_printf(cn, "%lld", time);
301 				break;
302 			}
303 			/* hostname */
304 			case 'h':
305 				down_read(&uts_sem);
306 				err = cn_esc_printf(cn, "%s",
307 					      utsname()->nodename);
308 				up_read(&uts_sem);
309 				break;
310 			/* executable, could be changed by prctl PR_SET_NAME etc */
311 			case 'e':
312 				err = cn_esc_printf(cn, "%s", current->comm);
313 				break;
314 			/* file name of executable */
315 			case 'f':
316 				err = cn_print_exe_file(cn, true);
317 				break;
318 			case 'E':
319 				err = cn_print_exe_file(cn, false);
320 				break;
321 			/* core limit size */
322 			case 'c':
323 				err = cn_printf(cn, "%lu",
324 					      rlimit(RLIMIT_CORE));
325 				break;
326 			default:
327 				break;
328 			}
329 			++pat_ptr;
330 		}
331 
332 		if (err)
333 			return err;
334 	}
335 
336 out:
337 	/* Backward compatibility with core_uses_pid:
338 	 *
339 	 * If core_pattern does not include a %p (as is the default)
340 	 * and core_uses_pid is set, then .%pid will be appended to
341 	 * the filename. Do not do this for piped commands. */
342 	if (!ispipe && !pid_in_pattern && core_uses_pid) {
343 		err = cn_printf(cn, ".%d", task_tgid_vnr(current));
344 		if (err)
345 			return err;
346 	}
347 	return ispipe;
348 }
349 
350 static int zap_process(struct task_struct *start, int exit_code)
351 {
352 	struct task_struct *t;
353 	int nr = 0;
354 
355 	/* ignore all signals except SIGKILL, see prepare_signal() */
356 	start->signal->flags = SIGNAL_GROUP_EXIT;
357 	start->signal->group_exit_code = exit_code;
358 	start->signal->group_stop_count = 0;
359 
360 	for_each_thread(start, t) {
361 		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
362 		if (t != current && !(t->flags & PF_POSTCOREDUMP)) {
363 			sigaddset(&t->pending.signal, SIGKILL);
364 			signal_wake_up(t, 1);
365 			nr++;
366 		}
367 	}
368 
369 	return nr;
370 }
371 
372 static int zap_threads(struct task_struct *tsk,
373 			struct core_state *core_state, int exit_code)
374 {
375 	struct signal_struct *signal = tsk->signal;
376 	int nr = -EAGAIN;
377 
378 	spin_lock_irq(&tsk->sighand->siglock);
379 	if (!(signal->flags & SIGNAL_GROUP_EXIT) && !signal->group_exec_task) {
380 		signal->core_state = core_state;
381 		nr = zap_process(tsk, exit_code);
382 		clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
383 		tsk->flags |= PF_DUMPCORE;
384 		atomic_set(&core_state->nr_threads, nr);
385 	}
386 	spin_unlock_irq(&tsk->sighand->siglock);
387 	return nr;
388 }
389 
390 static int coredump_wait(int exit_code, struct core_state *core_state)
391 {
392 	struct task_struct *tsk = current;
393 	int core_waiters = -EBUSY;
394 
395 	init_completion(&core_state->startup);
396 	core_state->dumper.task = tsk;
397 	core_state->dumper.next = NULL;
398 
399 	core_waiters = zap_threads(tsk, core_state, exit_code);
400 	if (core_waiters > 0) {
401 		struct core_thread *ptr;
402 
403 		freezer_do_not_count();
404 		wait_for_completion(&core_state->startup);
405 		freezer_count();
406 		/*
407 		 * Wait for all the threads to become inactive, so that
408 		 * all the thread context (extended register state, like
409 		 * fpu etc) gets copied to the memory.
410 		 */
411 		ptr = core_state->dumper.next;
412 		while (ptr != NULL) {
413 			wait_task_inactive(ptr->task, 0);
414 			ptr = ptr->next;
415 		}
416 	}
417 
418 	return core_waiters;
419 }
420 
421 static void coredump_finish(bool core_dumped)
422 {
423 	struct core_thread *curr, *next;
424 	struct task_struct *task;
425 
426 	spin_lock_irq(&current->sighand->siglock);
427 	if (core_dumped && !__fatal_signal_pending(current))
428 		current->signal->group_exit_code |= 0x80;
429 	next = current->signal->core_state->dumper.next;
430 	current->signal->core_state = NULL;
431 	spin_unlock_irq(&current->sighand->siglock);
432 
433 	while ((curr = next) != NULL) {
434 		next = curr->next;
435 		task = curr->task;
436 		/*
437 		 * see coredump_task_exit(), curr->task must not see
438 		 * ->task == NULL before we read ->next.
439 		 */
440 		smp_mb();
441 		curr->task = NULL;
442 		wake_up_process(task);
443 	}
444 }
445 
446 static bool dump_interrupted(void)
447 {
448 	/*
449 	 * SIGKILL or freezing() interrupt the coredumping. Perhaps we
450 	 * can do try_to_freeze() and check __fatal_signal_pending(),
451 	 * but then we need to teach dump_write() to restart and clear
452 	 * TIF_SIGPENDING.
453 	 */
454 	return fatal_signal_pending(current) || freezing(current);
455 }
456 
457 static void wait_for_dump_helpers(struct file *file)
458 {
459 	struct pipe_inode_info *pipe = file->private_data;
460 
461 	pipe_lock(pipe);
462 	pipe->readers++;
463 	pipe->writers--;
464 	wake_up_interruptible_sync(&pipe->rd_wait);
465 	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
466 	pipe_unlock(pipe);
467 
468 	/*
469 	 * We actually want wait_event_freezable() but then we need
470 	 * to clear TIF_SIGPENDING and improve dump_interrupted().
471 	 */
472 	wait_event_interruptible(pipe->rd_wait, pipe->readers == 1);
473 
474 	pipe_lock(pipe);
475 	pipe->readers--;
476 	pipe->writers++;
477 	pipe_unlock(pipe);
478 }
479 
480 /*
481  * umh_pipe_setup
482  * helper function to customize the process used
483  * to collect the core in userspace.  Specifically
484  * it sets up a pipe and installs it as fd 0 (stdin)
485  * for the process.  Returns 0 on success, or
486  * PTR_ERR on failure.
487  * Note that it also sets the core limit to 1.  This
488  * is a special value that we use to trap recursive
489  * core dumps
490  */
491 static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
492 {
493 	struct file *files[2];
494 	struct coredump_params *cp = (struct coredump_params *)info->data;
495 	int err = create_pipe_files(files, 0);
496 	if (err)
497 		return err;
498 
499 	cp->file = files[1];
500 
501 	err = replace_fd(0, files[0], 0);
502 	fput(files[0]);
503 	/* and disallow core files too */
504 	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
505 
506 	return err;
507 }
508 
509 void do_coredump(const kernel_siginfo_t *siginfo)
510 {
511 	struct core_state core_state;
512 	struct core_name cn;
513 	struct mm_struct *mm = current->mm;
514 	struct linux_binfmt * binfmt;
515 	const struct cred *old_cred;
516 	struct cred *cred;
517 	int retval = 0;
518 	int ispipe;
519 	size_t *argv = NULL;
520 	int argc = 0;
521 	/* require nonrelative corefile path and be extra careful */
522 	bool need_suid_safe = false;
523 	bool core_dumped = false;
524 	static atomic_t core_dump_count = ATOMIC_INIT(0);
525 	struct coredump_params cprm = {
526 		.siginfo = siginfo,
527 		.regs = signal_pt_regs(),
528 		.limit = rlimit(RLIMIT_CORE),
529 		/*
530 		 * We must use the same mm->flags while dumping core to avoid
531 		 * inconsistency of bit flags, since this flag is not protected
532 		 * by any locks.
533 		 */
534 		.mm_flags = mm->flags,
535 	};
536 
537 	audit_core_dumps(siginfo->si_signo);
538 
539 	binfmt = mm->binfmt;
540 	if (!binfmt || !binfmt->core_dump)
541 		goto fail;
542 	if (!__get_dumpable(cprm.mm_flags))
543 		goto fail;
544 
545 	cred = prepare_creds();
546 	if (!cred)
547 		goto fail;
548 	/*
549 	 * We cannot trust fsuid as being the "true" uid of the process
550 	 * nor do we know its entire history. We only know it was tainted
551 	 * so we dump it as root in mode 2, and only into a controlled
552 	 * environment (pipe handler or fully qualified path).
553 	 */
554 	if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
555 		/* Setuid core dump mode */
556 		cred->fsuid = GLOBAL_ROOT_UID;	/* Dump root private */
557 		need_suid_safe = true;
558 	}
559 
560 	retval = coredump_wait(siginfo->si_signo, &core_state);
561 	if (retval < 0)
562 		goto fail_creds;
563 
564 	old_cred = override_creds(cred);
565 
566 	ispipe = format_corename(&cn, &cprm, &argv, &argc);
567 
568 	if (ispipe) {
569 		int argi;
570 		int dump_count;
571 		char **helper_argv;
572 		struct subprocess_info *sub_info;
573 
574 		if (ispipe < 0) {
575 			printk(KERN_WARNING "format_corename failed\n");
576 			printk(KERN_WARNING "Aborting core\n");
577 			goto fail_unlock;
578 		}
579 
580 		if (cprm.limit == 1) {
581 			/* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
582 			 *
583 			 * Normally core limits are irrelevant to pipes, since
584 			 * we're not writing to the file system, but we use
585 			 * cprm.limit of 1 here as a special value, this is a
586 			 * consistent way to catch recursive crashes.
587 			 * We can still crash if the core_pattern binary sets
588 			 * RLIM_CORE = !1, but it runs as root, and can do
589 			 * lots of stupid things.
590 			 *
591 			 * Note that we use task_tgid_vnr here to grab the pid
592 			 * of the process group leader.  That way we get the
593 			 * right pid if a thread in a multi-threaded
594 			 * core_pattern process dies.
595 			 */
596 			printk(KERN_WARNING
597 				"Process %d(%s) has RLIMIT_CORE set to 1\n",
598 				task_tgid_vnr(current), current->comm);
599 			printk(KERN_WARNING "Aborting core\n");
600 			goto fail_unlock;
601 		}
602 		cprm.limit = RLIM_INFINITY;
603 
604 		dump_count = atomic_inc_return(&core_dump_count);
605 		if (core_pipe_limit && (core_pipe_limit < dump_count)) {
606 			printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
607 			       task_tgid_vnr(current), current->comm);
608 			printk(KERN_WARNING "Skipping core dump\n");
609 			goto fail_dropcount;
610 		}
611 
612 		helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv),
613 					    GFP_KERNEL);
614 		if (!helper_argv) {
615 			printk(KERN_WARNING "%s failed to allocate memory\n",
616 			       __func__);
617 			goto fail_dropcount;
618 		}
619 		for (argi = 0; argi < argc; argi++)
620 			helper_argv[argi] = cn.corename + argv[argi];
621 		helper_argv[argi] = NULL;
622 
623 		retval = -ENOMEM;
624 		sub_info = call_usermodehelper_setup(helper_argv[0],
625 						helper_argv, NULL, GFP_KERNEL,
626 						umh_pipe_setup, NULL, &cprm);
627 		if (sub_info)
628 			retval = call_usermodehelper_exec(sub_info,
629 							  UMH_WAIT_EXEC);
630 
631 		kfree(helper_argv);
632 		if (retval) {
633 			printk(KERN_INFO "Core dump to |%s pipe failed\n",
634 			       cn.corename);
635 			goto close_fail;
636 		}
637 	} else {
638 		struct user_namespace *mnt_userns;
639 		struct inode *inode;
640 		int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
641 				 O_LARGEFILE | O_EXCL;
642 
643 		if (cprm.limit < binfmt->min_coredump)
644 			goto fail_unlock;
645 
646 		if (need_suid_safe && cn.corename[0] != '/') {
647 			printk(KERN_WARNING "Pid %d(%s) can only dump core "\
648 				"to fully qualified path!\n",
649 				task_tgid_vnr(current), current->comm);
650 			printk(KERN_WARNING "Skipping core dump\n");
651 			goto fail_unlock;
652 		}
653 
654 		/*
655 		 * Unlink the file if it exists unless this is a SUID
656 		 * binary - in that case, we're running around with root
657 		 * privs and don't want to unlink another user's coredump.
658 		 */
659 		if (!need_suid_safe) {
660 			/*
661 			 * If it doesn't exist, that's fine. If there's some
662 			 * other problem, we'll catch it at the filp_open().
663 			 */
664 			do_unlinkat(AT_FDCWD, getname_kernel(cn.corename));
665 		}
666 
667 		/*
668 		 * There is a race between unlinking and creating the
669 		 * file, but if that causes an EEXIST here, that's
670 		 * fine - another process raced with us while creating
671 		 * the corefile, and the other process won. To userspace,
672 		 * what matters is that at least one of the two processes
673 		 * writes its coredump successfully, not which one.
674 		 */
675 		if (need_suid_safe) {
676 			/*
677 			 * Using user namespaces, normal user tasks can change
678 			 * their current->fs->root to point to arbitrary
679 			 * directories. Since the intention of the "only dump
680 			 * with a fully qualified path" rule is to control where
681 			 * coredumps may be placed using root privileges,
682 			 * current->fs->root must not be used. Instead, use the
683 			 * root directory of init_task.
684 			 */
685 			struct path root;
686 
687 			task_lock(&init_task);
688 			get_fs_root(init_task.fs, &root);
689 			task_unlock(&init_task);
690 			cprm.file = file_open_root(&root, cn.corename,
691 						   open_flags, 0600);
692 			path_put(&root);
693 		} else {
694 			cprm.file = filp_open(cn.corename, open_flags, 0600);
695 		}
696 		if (IS_ERR(cprm.file))
697 			goto fail_unlock;
698 
699 		inode = file_inode(cprm.file);
700 		if (inode->i_nlink > 1)
701 			goto close_fail;
702 		if (d_unhashed(cprm.file->f_path.dentry))
703 			goto close_fail;
704 		/*
705 		 * AK: actually i see no reason to not allow this for named
706 		 * pipes etc, but keep the previous behaviour for now.
707 		 */
708 		if (!S_ISREG(inode->i_mode))
709 			goto close_fail;
710 		/*
711 		 * Don't dump core if the filesystem changed owner or mode
712 		 * of the file during file creation. This is an issue when
713 		 * a process dumps core while its cwd is e.g. on a vfat
714 		 * filesystem.
715 		 */
716 		mnt_userns = file_mnt_user_ns(cprm.file);
717 		if (!uid_eq(i_uid_into_mnt(mnt_userns, inode),
718 			    current_fsuid())) {
719 			pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n",
720 					    cn.corename);
721 			goto close_fail;
722 		}
723 		if ((inode->i_mode & 0677) != 0600) {
724 			pr_info_ratelimited("Core dump to %s aborted: cannot preserve file permissions\n",
725 					    cn.corename);
726 			goto close_fail;
727 		}
728 		if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
729 			goto close_fail;
730 		if (do_truncate(mnt_userns, cprm.file->f_path.dentry,
731 				0, 0, cprm.file))
732 			goto close_fail;
733 	}
734 
735 	/* get us an unshared descriptor table; almost always a no-op */
736 	/* The cell spufs coredump code reads the file descriptor tables */
737 	retval = unshare_files();
738 	if (retval)
739 		goto close_fail;
740 	if (!dump_interrupted()) {
741 		/*
742 		 * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
743 		 * have this set to NULL.
744 		 */
745 		if (!cprm.file) {
746 			pr_info("Core dump to |%s disabled\n", cn.corename);
747 			goto close_fail;
748 		}
749 		file_start_write(cprm.file);
750 		core_dumped = binfmt->core_dump(&cprm);
751 		/*
752 		 * Ensures that file size is big enough to contain the current
753 		 * file postion. This prevents gdb from complaining about
754 		 * a truncated file if the last "write" to the file was
755 		 * dump_skip.
756 		 */
757 		if (cprm.to_skip) {
758 			cprm.to_skip--;
759 			dump_emit(&cprm, "", 1);
760 		}
761 		file_end_write(cprm.file);
762 	}
763 	if (ispipe && core_pipe_limit)
764 		wait_for_dump_helpers(cprm.file);
765 close_fail:
766 	if (cprm.file)
767 		filp_close(cprm.file, NULL);
768 fail_dropcount:
769 	if (ispipe)
770 		atomic_dec(&core_dump_count);
771 fail_unlock:
772 	kfree(argv);
773 	kfree(cn.corename);
774 	coredump_finish(core_dumped);
775 	revert_creds(old_cred);
776 fail_creds:
777 	put_cred(cred);
778 fail:
779 	return;
780 }
781 
782 /*
783  * Core dumping helper functions.  These are the only things you should
784  * do on a core-file: use only these functions to write out all the
785  * necessary info.
786  */
787 static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr)
788 {
789 	struct file *file = cprm->file;
790 	loff_t pos = file->f_pos;
791 	ssize_t n;
792 	if (cprm->written + nr > cprm->limit)
793 		return 0;
794 
795 
796 	if (dump_interrupted())
797 		return 0;
798 	n = __kernel_write(file, addr, nr, &pos);
799 	if (n != nr)
800 		return 0;
801 	file->f_pos = pos;
802 	cprm->written += n;
803 	cprm->pos += n;
804 
805 	return 1;
806 }
807 
808 static int __dump_skip(struct coredump_params *cprm, size_t nr)
809 {
810 	static char zeroes[PAGE_SIZE];
811 	struct file *file = cprm->file;
812 	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
813 		if (dump_interrupted() ||
814 		    file->f_op->llseek(file, nr, SEEK_CUR) < 0)
815 			return 0;
816 		cprm->pos += nr;
817 		return 1;
818 	} else {
819 		while (nr > PAGE_SIZE) {
820 			if (!__dump_emit(cprm, zeroes, PAGE_SIZE))
821 				return 0;
822 			nr -= PAGE_SIZE;
823 		}
824 		return __dump_emit(cprm, zeroes, nr);
825 	}
826 }
827 
828 int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
829 {
830 	if (cprm->to_skip) {
831 		if (!__dump_skip(cprm, cprm->to_skip))
832 			return 0;
833 		cprm->to_skip = 0;
834 	}
835 	return __dump_emit(cprm, addr, nr);
836 }
837 EXPORT_SYMBOL(dump_emit);
838 
839 void dump_skip_to(struct coredump_params *cprm, unsigned long pos)
840 {
841 	cprm->to_skip = pos - cprm->pos;
842 }
843 EXPORT_SYMBOL(dump_skip_to);
844 
845 void dump_skip(struct coredump_params *cprm, size_t nr)
846 {
847 	cprm->to_skip += nr;
848 }
849 EXPORT_SYMBOL(dump_skip);
850 
851 #ifdef CONFIG_ELF_CORE
852 int dump_user_range(struct coredump_params *cprm, unsigned long start,
853 		    unsigned long len)
854 {
855 	unsigned long addr;
856 
857 	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
858 		struct page *page;
859 		int stop;
860 
861 		/*
862 		 * To avoid having to allocate page tables for virtual address
863 		 * ranges that have never been used yet, and also to make it
864 		 * easy to generate sparse core files, use a helper that returns
865 		 * NULL when encountering an empty page table entry that would
866 		 * otherwise have been filled with the zero page.
867 		 */
868 		page = get_dump_page(addr);
869 		if (page) {
870 			void *kaddr = kmap_local_page(page);
871 
872 			stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
873 			kunmap_local(kaddr);
874 			put_page(page);
875 			if (stop)
876 				return 0;
877 		} else {
878 			dump_skip(cprm, PAGE_SIZE);
879 		}
880 	}
881 	return 1;
882 }
883 #endif
884 
885 int dump_align(struct coredump_params *cprm, int align)
886 {
887 	unsigned mod = (cprm->pos + cprm->to_skip) & (align - 1);
888 	if (align & (align - 1))
889 		return 0;
890 	if (mod)
891 		cprm->to_skip += align - mod;
892 	return 1;
893 }
894 EXPORT_SYMBOL(dump_align);
895 
896 #ifdef CONFIG_SYSCTL
897 
898 void validate_coredump_safety(void)
899 {
900 	if (suid_dumpable == SUID_DUMP_ROOT &&
901 	    core_pattern[0] != '/' && core_pattern[0] != '|') {
902 		pr_warn(
903 "Unsafe core_pattern used with fs.suid_dumpable=2.\n"
904 "Pipe handler or fully qualified core dump path required.\n"
905 "Set kernel.core_pattern before fs.suid_dumpable.\n"
906 		);
907 	}
908 }
909 
910 static int proc_dostring_coredump(struct ctl_table *table, int write,
911 		  void *buffer, size_t *lenp, loff_t *ppos)
912 {
913 	int error = proc_dostring(table, write, buffer, lenp, ppos);
914 
915 	if (!error)
916 		validate_coredump_safety();
917 	return error;
918 }
919 
920 static struct ctl_table coredump_sysctls[] = {
921 	{
922 		.procname	= "core_uses_pid",
923 		.data		= &core_uses_pid,
924 		.maxlen		= sizeof(int),
925 		.mode		= 0644,
926 		.proc_handler	= proc_dointvec,
927 	},
928 	{
929 		.procname	= "core_pattern",
930 		.data		= core_pattern,
931 		.maxlen		= CORENAME_MAX_SIZE,
932 		.mode		= 0644,
933 		.proc_handler	= proc_dostring_coredump,
934 	},
935 	{
936 		.procname	= "core_pipe_limit",
937 		.data		= &core_pipe_limit,
938 		.maxlen		= sizeof(unsigned int),
939 		.mode		= 0644,
940 		.proc_handler	= proc_dointvec,
941 	},
942 	{ }
943 };
944 
945 static int __init init_fs_coredump_sysctls(void)
946 {
947 	register_sysctl_init("kernel", coredump_sysctls);
948 	return 0;
949 }
950 fs_initcall(init_fs_coredump_sysctls);
951 #endif /* CONFIG_SYSCTL */
952 
953 /*
954  * The purpose of always_dump_vma() is to make sure that special kernel mappings
955  * that are useful for post-mortem analysis are included in every core dump.
956  * In that way we ensure that the core dump is fully interpretable later
957  * without matching up the same kernel and hardware config to see what PC values
958  * meant. These special mappings include - vDSO, vsyscall, and other
959  * architecture specific mappings
960  */
961 static bool always_dump_vma(struct vm_area_struct *vma)
962 {
963 	/* Any vsyscall mappings? */
964 	if (vma == get_gate_vma(vma->vm_mm))
965 		return true;
966 
967 	/*
968 	 * Assume that all vmas with a .name op should always be dumped.
969 	 * If this changes, a new vm_ops field can easily be added.
970 	 */
971 	if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
972 		return true;
973 
974 	/*
975 	 * arch_vma_name() returns non-NULL for special architecture mappings,
976 	 * such as vDSO sections.
977 	 */
978 	if (arch_vma_name(vma))
979 		return true;
980 
981 	return false;
982 }
983 
984 #define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1
985 
986 /*
987  * Decide how much of @vma's contents should be included in a core dump.
988  */
989 static unsigned long vma_dump_size(struct vm_area_struct *vma,
990 				   unsigned long mm_flags)
991 {
992 #define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
993 
994 	/* always dump the vdso and vsyscall sections */
995 	if (always_dump_vma(vma))
996 		goto whole;
997 
998 	if (vma->vm_flags & VM_DONTDUMP)
999 		return 0;
1000 
1001 	/* support for DAX */
1002 	if (vma_is_dax(vma)) {
1003 		if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
1004 			goto whole;
1005 		if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
1006 			goto whole;
1007 		return 0;
1008 	}
1009 
1010 	/* Hugetlb memory check */
1011 	if (is_vm_hugetlb_page(vma)) {
1012 		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
1013 			goto whole;
1014 		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
1015 			goto whole;
1016 		return 0;
1017 	}
1018 
1019 	/* Do not dump I/O mapped devices or special mappings */
1020 	if (vma->vm_flags & VM_IO)
1021 		return 0;
1022 
1023 	/* By default, dump shared memory if mapped from an anonymous file. */
1024 	if (vma->vm_flags & VM_SHARED) {
1025 		if (file_inode(vma->vm_file)->i_nlink == 0 ?
1026 		    FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
1027 			goto whole;
1028 		return 0;
1029 	}
1030 
1031 	/* Dump segments that have been written to.  */
1032 	if ((!IS_ENABLED(CONFIG_MMU) || vma->anon_vma) && FILTER(ANON_PRIVATE))
1033 		goto whole;
1034 	if (vma->vm_file == NULL)
1035 		return 0;
1036 
1037 	if (FILTER(MAPPED_PRIVATE))
1038 		goto whole;
1039 
1040 	/*
1041 	 * If this is the beginning of an executable file mapping,
1042 	 * dump the first page to aid in determining what was mapped here.
1043 	 */
1044 	if (FILTER(ELF_HEADERS) &&
1045 	    vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
1046 		if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
1047 			return PAGE_SIZE;
1048 
1049 		/*
1050 		 * ELF libraries aren't always executable.
1051 		 * We'll want to check whether the mapping starts with the ELF
1052 		 * magic, but not now - we're holding the mmap lock,
1053 		 * so copy_from_user() doesn't work here.
1054 		 * Use a placeholder instead, and fix it up later in
1055 		 * dump_vma_snapshot().
1056 		 */
1057 		return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER;
1058 	}
1059 
1060 #undef	FILTER
1061 
1062 	return 0;
1063 
1064 whole:
1065 	return vma->vm_end - vma->vm_start;
1066 }
1067 
1068 static struct vm_area_struct *first_vma(struct task_struct *tsk,
1069 					struct vm_area_struct *gate_vma)
1070 {
1071 	struct vm_area_struct *ret = tsk->mm->mmap;
1072 
1073 	if (ret)
1074 		return ret;
1075 	return gate_vma;
1076 }
1077 
1078 /*
1079  * Helper function for iterating across a vma list.  It ensures that the caller
1080  * will visit `gate_vma' prior to terminating the search.
1081  */
1082 static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
1083 				       struct vm_area_struct *gate_vma)
1084 {
1085 	struct vm_area_struct *ret;
1086 
1087 	ret = this_vma->vm_next;
1088 	if (ret)
1089 		return ret;
1090 	if (this_vma == gate_vma)
1091 		return NULL;
1092 	return gate_vma;
1093 }
1094 
1095 /*
1096  * Under the mmap_lock, take a snapshot of relevant information about the task's
1097  * VMAs.
1098  */
1099 int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
1100 		      struct core_vma_metadata **vma_meta,
1101 		      size_t *vma_data_size_ptr)
1102 {
1103 	struct vm_area_struct *vma, *gate_vma;
1104 	struct mm_struct *mm = current->mm;
1105 	int i;
1106 	size_t vma_data_size = 0;
1107 
1108 	/*
1109 	 * Once the stack expansion code is fixed to not change VMA bounds
1110 	 * under mmap_lock in read mode, this can be changed to take the
1111 	 * mmap_lock in read mode.
1112 	 */
1113 	if (mmap_write_lock_killable(mm))
1114 		return -EINTR;
1115 
1116 	gate_vma = get_gate_vma(mm);
1117 	*vma_count = mm->map_count + (gate_vma ? 1 : 0);
1118 
1119 	*vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL);
1120 	if (!*vma_meta) {
1121 		mmap_write_unlock(mm);
1122 		return -ENOMEM;
1123 	}
1124 
1125 	for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
1126 			vma = next_vma(vma, gate_vma), i++) {
1127 		struct core_vma_metadata *m = (*vma_meta) + i;
1128 
1129 		m->start = vma->vm_start;
1130 		m->end = vma->vm_end;
1131 		m->flags = vma->vm_flags;
1132 		m->dump_size = vma_dump_size(vma, cprm->mm_flags);
1133 	}
1134 
1135 	mmap_write_unlock(mm);
1136 
1137 	if (WARN_ON(i != *vma_count)) {
1138 		kvfree(*vma_meta);
1139 		return -EFAULT;
1140 	}
1141 
1142 	for (i = 0; i < *vma_count; i++) {
1143 		struct core_vma_metadata *m = (*vma_meta) + i;
1144 
1145 		if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) {
1146 			char elfmag[SELFMAG];
1147 
1148 			if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) ||
1149 					memcmp(elfmag, ELFMAG, SELFMAG) != 0) {
1150 				m->dump_size = 0;
1151 			} else {
1152 				m->dump_size = PAGE_SIZE;
1153 			}
1154 		}
1155 
1156 		vma_data_size += m->dump_size;
1157 	}
1158 
1159 	*vma_data_size_ptr = vma_data_size;
1160 	return 0;
1161 }
1162