// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/exit.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/posix-timers.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h>	/* for audit_free() */
#include <linux/resource.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
#include <linux/rethook.h>
#include <linux/sysfs.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/mmu_context.h>

/*
 * The default value should be high enough to not crash a system that randomly
 * crashes its kernel from time to time, but low enough to at least not permit
 * overflowing 32-bit refcounts or the ldsem writer count.
 */
static unsigned int oops_limit = 10000;

#ifdef CONFIG_SYSCTL
static struct ctl_table kern_exit_table[] = {
	{
		.procname	= "oops_limit",
		.data		= &oops_limit,
		.maxlen		= sizeof(oops_limit),
		.mode		= 0644,
		.proc_handler	= proc_douintvec,
	},
	{ }
};

static __init int kernel_exit_sysctls_init(void)
{
	register_sysctl_init("kernel", kern_exit_table);
	return 0;
}
late_initcall(kernel_exit_sysctls_init);
#endif

static atomic_t oops_count = ATOMIC_INIT(0);

#ifdef CONFIG_SYSFS
static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
			       char *page)
{
	return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
}

static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);

static __init int kernel_exit_sysfs_init(void)
{
	sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
	return 0;
}
late_initcall(kernel_exit_sysfs_init);
#endif

static void __unhash_process(struct task_struct *p, bool group_dead)
{
	nr_threads--;
	detach_pid(p, PIDTYPE_PID);
	if (group_dead) {
		detach_pid(p, PIDTYPE_TGID);
		detach_pid(p, PIDTYPE_PGID);
		detach_pid(p, PIDTYPE_SID);

		list_del_rcu(&p->tasks);
		list_del_init(&p->sibling);
		__this_cpu_dec(process_counts);
	}
	list_del_rcu(&p->thread_group);
	list_del_rcu(&p->thread_node);
}

/*
 * This function expects the tasklist_lock write-locked.
 */
static void __exit_signal(struct task_struct *tsk)
{
	struct signal_struct *sig = tsk->signal;
	bool group_dead = thread_group_leader(tsk);
	struct sighand_struct *sighand;
	struct tty_struct *tty;
	u64 utime, stime;

	sighand = rcu_dereference_check(tsk->sighand,
					lockdep_tasklist_lock_is_held());
	spin_lock(&sighand->siglock);

#ifdef CONFIG_POSIX_TIMERS
	posix_cpu_timers_exit(tsk);
	if (group_dead)
		posix_cpu_timers_exit_group(tsk);
#endif

	if (group_dead) {
		tty = sig->tty;
		sig->tty = NULL;
	} else {
		/*
		 * If there is any task waiting for the group exit
		 * then notify it:
		 */
		if (sig->notify_count > 0 && !--sig->notify_count)
			wake_up_process(sig->group_exec_task);

		if (tsk == sig->curr_target)
			sig->curr_target = next_thread(tsk);
	}

	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
			      sizeof(unsigned long long));

	/*
	 * Accumulate here the counters for all threads as they die. We could
	 * skip the group leader because it is the last user of signal_struct,
	 * but we want to avoid the race with thread_group_cputime() which can
	 * see the empty ->thread_head list.
	 */
	task_cputime(tsk, &utime, &stime);
	write_seqlock(&sig->stats_lock);
	sig->utime += utime;
	sig->stime += stime;
	sig->gtime += task_gtime(tsk);
	sig->min_flt += tsk->min_flt;
	sig->maj_flt += tsk->maj_flt;
	sig->nvcsw += tsk->nvcsw;
	sig->nivcsw += tsk->nivcsw;
	sig->inblock += task_io_get_inblock(tsk);
	sig->oublock += task_io_get_oublock(tsk);
	task_io_accounting_add(&sig->ioac, &tsk->ioac);
	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
	sig->nr_threads--;
	__unhash_process(tsk, group_dead);
	write_sequnlock(&sig->stats_lock);

	/*
	 * Do this under ->siglock, we can race with another thread
	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
	 */
	flush_sigqueue(&tsk->pending);
	tsk->sighand = NULL;
	spin_unlock(&sighand->siglock);

	__cleanup_sighand(sighand);
	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
	if (group_dead) {
		flush_sigqueue(&sig->shared_pending);
		tty_kref_put(tty);
	}
}

static void delayed_put_task_struct(struct rcu_head *rhp)
{
	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);

	kprobe_flush_task(tsk);
	rethook_flush_task(tsk);
	perf_event_delayed_put(tsk);
	trace_sched_process_free(tsk);
	put_task_struct(tsk);
}

void put_task_struct_rcu_user(struct task_struct *task)
{
	if (refcount_dec_and_test(&task->rcu_users))
		call_rcu(&task->rcu, delayed_put_task_struct);
}

void __weak release_thread(struct task_struct *dead_task)
{
}

void release_task(struct task_struct *p)
{
	struct task_struct *leader;
	struct pid *thread_pid;
	int zap_leader;
repeat:
	/* don't need to get the RCU readlock here - the process is dead and
	 * can't be modifying its own credentials. But shut RCU-lockdep up */
	rcu_read_lock();
	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
	rcu_read_unlock();

	cgroup_release(p);

	write_lock_irq(&tasklist_lock);
	ptrace_release_task(p);
	thread_pid = get_pid(p->thread_pid);
	__exit_signal(p);

	/*
	 * If we are the last non-leader member of the thread
	 * group, and the leader is zombie, then notify the
	 * group leader's parent process. (if it wants notification.)
	 */
	zap_leader = 0;
	leader = p->group_leader;
	if (leader != p && thread_group_empty(leader)
			&& leader->exit_state == EXIT_ZOMBIE) {
		/*
		 * If we were the last child thread and the leader has
		 * exited already, and the leader's parent ignores SIGCHLD,
		 * then we are the one who should release the leader.
		 */
		zap_leader = do_notify_parent(leader, leader->exit_signal);
		if (zap_leader)
			leader->exit_state = EXIT_DEAD;
	}

	write_unlock_irq(&tasklist_lock);
	seccomp_filter_release(p);
	proc_flush_pid(thread_pid);
	put_pid(thread_pid);
	release_thread(p);
	put_task_struct_rcu_user(p);

	p = leader;
	if (unlikely(zap_leader))
		goto repeat;
}

int rcuwait_wake_up(struct rcuwait *w)
{
	int ret = 0;
	struct task_struct *task;

	rcu_read_lock();

	/*
	 * Order condition vs @task, such that everything prior to the load
	 * of @task is visible. This is the condition as to why the user called
	 * rcuwait_wake() in the first place. Pairs with set_current_state()
	 * barrier (A) in rcuwait_wait_event().
	 *
	 *    WAIT                WAKE
	 *    [S] tsk = current   [S] cond = true
	 *        MB (A)              MB (B)
	 *    [L] cond            [L] tsk
	 */
	smp_mb(); /* (B) */

	task = rcu_dereference(w->task);
	if (task)
		ret = wake_up_process(task);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL_GPL(rcuwait_wake_up);

/*
 * Determine if a process group is "orphaned", according to the POSIX
 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 * by terminal-generated stop signals.  Newly orphaned process groups are
 * to receive a SIGHUP and a SIGCONT.
 *
 * "I ask you, have you ever known what it is to be an orphan?"
 */
static int will_become_orphaned_pgrp(struct pid *pgrp,
					struct task_struct *ignored_task)
{
	struct task_struct *p;

	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
		if ((p == ignored_task) ||
		    (p->exit_state && thread_group_empty(p)) ||
		    is_global_init(p->real_parent))
			continue;

		if (task_pgrp(p->real_parent) != pgrp &&
		    task_session(p->real_parent) == task_session(p))
			return 0;
	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);

	return 1;
}

int is_current_pgrp_orphaned(void)
{
	int retval;

	read_lock(&tasklist_lock);
	retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
	read_unlock(&tasklist_lock);

	return retval;
}

static bool has_stopped_jobs(struct pid *pgrp)
{
	struct task_struct *p;

	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
		if (p->signal->flags & SIGNAL_STOP_STOPPED)
			return true;
	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);

	return false;
}

/*
 * Check to see if any process groups have become orphaned as
 * a result of our exiting, and if they have any stopped jobs,
 * send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 */
static void
kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
{
	struct pid *pgrp = task_pgrp(tsk);
	struct task_struct *ignored_task = tsk;

	if (!parent)
		/* exit: our father is in a different pgrp than
		 * we are and we were the only connection outside.
		 */
		parent = tsk->real_parent;
	else
		/* reparent: our child is in a different pgrp than
		 * we are, and it was the only connection outside.
		 */
		ignored_task = NULL;

	if (task_pgrp(parent) != pgrp &&
	    task_session(parent) == task_session(tsk) &&
	    will_become_orphaned_pgrp(pgrp, ignored_task) &&
	    has_stopped_jobs(pgrp)) {
		__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
		__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
	}
}

static void coredump_task_exit(struct task_struct *tsk)
{
	struct core_state *core_state;

	/*
	 * Serialize with any possible pending coredump.
	 * We must hold siglock around checking core_state
	 * and setting PF_POSTCOREDUMP.  The core-inducing thread
	 * will increment ->nr_threads for each thread in the
	 * group without PF_POSTCOREDUMP set.
	 */
	spin_lock_irq(&tsk->sighand->siglock);
	tsk->flags |= PF_POSTCOREDUMP;
	core_state = tsk->signal->core_state;
	spin_unlock_irq(&tsk->sighand->siglock);
	if (core_state) {
		struct core_thread self;

		self.task = current;
		if (self.task->flags & PF_SIGNALED)
			self.next = xchg(&core_state->dumper.next, &self);
		else
			self.task = NULL;
		/*
		 * Implies mb(), the result of xchg() must be visible
		 * to core_state->dumper.
		 */
		if (atomic_dec_and_test(&core_state->nr_threads))
			complete(&core_state->startup);

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
			if (!self.task) /* see coredump_finish() */
				break;
			schedule();
		}
		__set_current_state(TASK_RUNNING);
	}
}

#ifdef CONFIG_MEMCG
/*
 * A task is exiting.   If it owned this mm, find a new owner for the mm.
 */
void mm_update_next_owner(struct mm_struct *mm)
{
	struct task_struct *c, *g, *p = current;

retry:
	/*
	 * If the exiting or execing task is not the owner, it's
	 * someone else's problem.
	 */
	if (mm->owner != p)
		return;
	/*
	 * The current owner is exiting/execing and there are no other
	 * candidates.  Do not leave the mm pointing to a possibly
	 * freed task structure.
	 */
	if (atomic_read(&mm->mm_users) <= 1) {
		WRITE_ONCE(mm->owner, NULL);
		return;
	}

	read_lock(&tasklist_lock);
	/*
	 * Search in the children
	 */
	list_for_each_entry(c, &p->children, sibling) {
		if (c->mm == mm)
			goto assign_new_owner;
	}

	/*
	 * Search in the siblings
	 */
	list_for_each_entry(c, &p->real_parent->children, sibling) {
		if (c->mm == mm)
			goto assign_new_owner;
	}

	/*
	 * Search through everything else, we should not get here often.
	 */
	for_each_process(g) {
		if (g->flags & PF_KTHREAD)
			continue;
		for_each_thread(g, c) {
			if (c->mm == mm)
				goto assign_new_owner;
			if (c->mm)
				break;
		}
	}
	read_unlock(&tasklist_lock);
	/*
	 * We found no owner yet mm_users > 1: this implies that we are
	 * most likely racing with swapoff (try_to_unuse()) or /proc or
	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
	 */
	WRITE_ONCE(mm->owner, NULL);
	return;

assign_new_owner:
	BUG_ON(c == p);
	get_task_struct(c);
	/*
	 * The task_lock protects c->mm from changing.
	 * We always want mm->owner->mm == mm
	 */
	task_lock(c);
	/*
	 * Delay read_unlock() till we have the task_lock()
	 * to ensure that c does not slip away underneath us
	 */
	read_unlock(&tasklist_lock);
	if (c->mm != mm) {
		task_unlock(c);
		put_task_struct(c);
		goto retry;
	}
	WRITE_ONCE(mm->owner, c);
	lru_gen_migrate_mm(mm);
	task_unlock(c);
	put_task_struct(c);
}
#endif /* CONFIG_MEMCG */

/*
 * Turn us into a lazy TLB process if we
 * aren't already..
 */
static void exit_mm(void)
{
	struct mm_struct *mm = current->mm;

	exit_mm_release(current, mm);
	if (!mm)
		return;
	sync_mm_rss(mm);
	mmap_read_lock(mm);
	mmgrab_lazy_tlb(mm);
	BUG_ON(mm != current->active_mm);
	/* more a memory barrier than a real lock */
	task_lock(current);
	/*
	 * When a thread stops operating on an address space, the loop
	 * in membarrier_private_expedited() may not observe that
	 * tsk->mm, and the loop in membarrier_global_expedited() may
	 * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
	 * rq->membarrier_state, so those would not issue an IPI.
	 * Membarrier requires a memory barrier after accessing
	 * user-space memory, before clearing tsk->mm or the
	 * rq->membarrier_state.
	 */
	smp_mb__after_spinlock();
	local_irq_disable();
	current->mm = NULL;
	membarrier_update_current_mm(NULL);
	enter_lazy_tlb(mm, current);
	local_irq_enable();
	task_unlock(current);
	mmap_read_unlock(mm);
	mm_update_next_owner(mm);
	mmput(mm);
	if (test_thread_flag(TIF_MEMDIE))
		exit_oom_victim();
}

static struct task_struct *find_alive_thread(struct task_struct *p)
{
	struct task_struct *t;

	for_each_thread(p, t) {
		if (!(t->flags & PF_EXITING))
			return t;
	}
	return NULL;
}

static struct task_struct *find_child_reaper(struct task_struct *father,
						struct list_head *dead)
	__releases(&tasklist_lock)
	__acquires(&tasklist_lock)
{
	struct pid_namespace *pid_ns = task_active_pid_ns(father);
	struct task_struct *reaper = pid_ns->child_reaper;
	struct task_struct *p, *n;

	if (likely(reaper != father))
		return reaper;

	reaper = find_alive_thread(father);
	if (reaper) {
		pid_ns->child_reaper = reaper;
		return reaper;
	}

	write_unlock_irq(&tasklist_lock);

	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
		list_del_init(&p->ptrace_entry);
		release_task(p);
	}

	zap_pid_ns_processes(pid_ns);
	write_lock_irq(&tasklist_lock);

	return father;
}

/*
 * When we die, we re-parent all our children, and try to:
 * 1. give them to another thread in our thread group, if such a member exists
 * 2. give it to the first ancestor process which prctl'd itself as a
 *    child_subreaper for its children (like a service manager)
 * 3. give it to the init process (PID 1) in our pid namespace
 */
static struct task_struct *find_new_reaper(struct task_struct *father,
					   struct task_struct *child_reaper)
{
	struct task_struct *thread, *reaper;

	thread = find_alive_thread(father);
	if (thread)
		return thread;

	if (father->signal->has_child_subreaper) {
		unsigned int ns_level = task_pid(father)->level;
		/*
		 * Find the first ->is_child_subreaper ancestor in our pid_ns.
		 * We can't check reaper != child_reaper to ensure we do not
		 * cross the namespaces, the exiting parent could be injected
		 * by setns() + fork().
		 * We check pid->level, this is slightly more efficient than
		 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
		 */
		for (reaper = father->real_parent;
		     task_pid(reaper)->level == ns_level;
		     reaper = reaper->real_parent) {
			if (reaper == &init_task)
				break;
			if (!reaper->signal->is_child_subreaper)
				continue;
			thread = find_alive_thread(reaper);
			if (thread)
				return thread;
		}
	}

	return child_reaper;
}

/*
 * Any that need to be release_task'd are put on the @dead list.
 */
static void reparent_leader(struct task_struct *father, struct task_struct *p,
				struct list_head *dead)
{
	if (unlikely(p->exit_state == EXIT_DEAD))
		return;

	/* We don't want people slaying init. */
	p->exit_signal = SIGCHLD;

	/* If it has exited notify the new parent about this child's death. */
	if (!p->ptrace &&
	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
		if (do_notify_parent(p, p->exit_signal)) {
			p->exit_state = EXIT_DEAD;
			list_add(&p->ptrace_entry, dead);
		}
	}

	kill_orphaned_pgrp(p, father);
}

/*
 * This does two things:
 *
 * A.  Make init inherit all the child processes
 * B.  Check to see if any process groups have become orphaned
 *	as a result of our exiting, and if they have any stopped
 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 */
static void forget_original_parent(struct task_struct *father,
					struct list_head *dead)
{
	struct task_struct *p, *t, *reaper;

	if (unlikely(!list_empty(&father->ptraced)))
		exit_ptrace(father, dead);

	/* Can drop and reacquire tasklist_lock */
	reaper = find_child_reaper(father, dead);
	if (list_empty(&father->children))
		return;

	reaper = find_new_reaper(father, reaper);
	list_for_each_entry(p, &father->children, sibling) {
		for_each_thread(p, t) {
			RCU_INIT_POINTER(t->real_parent, reaper);
			BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
			if (likely(!t->ptrace))
				t->parent = t->real_parent;
			if (t->pdeath_signal)
				group_send_sig_info(t->pdeath_signal,
						    SEND_SIG_NOINFO, t,
						    PIDTYPE_TGID);
		}
		/*
		 * If this is a threaded reparent there is no need to
		 * notify anyone anything has happened.
		 */
		if (!same_thread_group(reaper, father))
			reparent_leader(father, p, dead);
	}
	list_splice_tail_init(&father->children, &reaper->children);
}

/*
 * Send signals to all our closest relatives so that they know
 * to properly mourn us..
 */
static void exit_notify(struct task_struct *tsk, int group_dead)
{
	bool autoreap;
	struct task_struct *p, *n;
	LIST_HEAD(dead);

	write_lock_irq(&tasklist_lock);
	forget_original_parent(tsk, &dead);

	if (group_dead)
		kill_orphaned_pgrp(tsk->group_leader, NULL);

	tsk->exit_state = EXIT_ZOMBIE;
	if (unlikely(tsk->ptrace)) {
		int sig = thread_group_leader(tsk) &&
				thread_group_empty(tsk) &&
				!ptrace_reparented(tsk) ?
			tsk->exit_signal : SIGCHLD;
		autoreap = do_notify_parent(tsk, sig);
	} else if (thread_group_leader(tsk)) {
		autoreap = thread_group_empty(tsk) &&
			   do_notify_parent(tsk, tsk->exit_signal);
	} else {
		autoreap = true;
	}

	if (autoreap) {
		tsk->exit_state = EXIT_DEAD;
		list_add(&tsk->ptrace_entry, &dead);
	}

	/* mt-exec, de_thread() is waiting for group leader */
	if (unlikely(tsk->signal->notify_count < 0))
		wake_up_process(tsk->signal->group_exec_task);
	write_unlock_irq(&tasklist_lock);

	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
		list_del_init(&p->ptrace_entry);
		release_task(p);
	}
}

#ifdef CONFIG_DEBUG_STACK_USAGE
static void check_stack_usage(void)
{
	static DEFINE_SPINLOCK(low_water_lock);
	static int lowest_to_date = THREAD_SIZE;
	unsigned long free;

	free = stack_not_used(current);

	if (free >= lowest_to_date)
		return;

	spin_lock(&low_water_lock);
	if (free < lowest_to_date) {
		pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
			current->comm, task_pid_nr(current), free);
		lowest_to_date = free;
	}
	spin_unlock(&low_water_lock);
}
#else
static inline void check_stack_usage(void) {}
#endif

static void synchronize_group_exit(struct task_struct *tsk, long code)
{
	struct sighand_struct *sighand = tsk->sighand;
	struct signal_struct *signal = tsk->signal;

	spin_lock_irq(&sighand->siglock);
	signal->quick_threads--;
	if ((signal->quick_threads == 0) &&
	    !(signal->flags & SIGNAL_GROUP_EXIT)) {
		signal->flags = SIGNAL_GROUP_EXIT;
		signal->group_exit_code = code;
		signal->group_stop_count = 0;
	}
	spin_unlock_irq(&sighand->siglock);
}

void __noreturn do_exit(long code)
{
	struct task_struct *tsk = current;
	int group_dead;

	WARN_ON(irqs_disabled());

	synchronize_group_exit(tsk, code);

	WARN_ON(tsk->plug);

	kcov_task_exit(tsk);
	kmsan_task_exit(tsk);

	coredump_task_exit(tsk);
	ptrace_event(PTRACE_EVENT_EXIT, code);

	validate_creds_for_do_exit(tsk);

	io_uring_files_cancel();
	exit_signals(tsk);  /* sets PF_EXITING */

	/* sync mm's RSS info before statistics gathering */
	if (tsk->mm)
		sync_mm_rss(tsk->mm);
	acct_update_integrals(tsk);
	group_dead = atomic_dec_and_test(&tsk->signal->live);
	if (group_dead) {
		/*
		 * If the last thread of global init has exited, panic
		 * immediately to get a useable coredump.
		 */
		if (unlikely(is_global_init(tsk)))
			panic("Attempted to kill init! exitcode=0x%08x\n",
				tsk->signal->group_exit_code ?: (int)code);

#ifdef CONFIG_POSIX_TIMERS
		hrtimer_cancel(&tsk->signal->real_timer);
		exit_itimers(tsk);
#endif
		if (tsk->mm)
			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
	}
	acct_collect(code, group_dead);
	if (group_dead)
		tty_audit_exit();
	audit_free(tsk);

	tsk->exit_code = code;
	taskstats_exit(tsk, group_dead);

	exit_mm();

	if (group_dead)
		acct_process();
	trace_sched_process_exit(tsk);

	exit_sem(tsk);
	exit_shm(tsk);
	exit_files(tsk);
	exit_fs(tsk);
	if (group_dead)
		disassociate_ctty(1);
	exit_task_namespaces(tsk);
	exit_task_work(tsk);
	exit_thread(tsk);

	/*
	 * Flush inherited counters to the parent - before the parent
	 * gets woken up by child-exit notifications.
	 *
	 * because of cgroup mode, must be called before cgroup_exit()
	 */
	perf_event_exit_task(tsk);

	sched_autogroup_exit_task(tsk);
	cgroup_exit(tsk);

	/*
	 * FIXME: do that only when needed, using sched_exit tracepoint
	 */
	flush_ptrace_hw_breakpoint(tsk);

	exit_tasks_rcu_start();
	exit_notify(tsk, group_dead);
	proc_exit_connector(tsk);
	mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
	if (unlikely(current->pi_state_cache))
		kfree(current->pi_state_cache);
#endif
	/*
	 * Make sure we are holding no locks:
	 */
	debug_check_no_locks_held();

	if (tsk->io_context)
		exit_io_context(tsk);

	if (tsk->splice_pipe)
		free_pipe_info(tsk->splice_pipe);

	if (tsk->task_frag.page)
		put_page(tsk->task_frag.page);

	validate_creds_for_do_exit(tsk);
	exit_task_stack_account(tsk);

	check_stack_usage();
	preempt_disable();
	if (tsk->nr_dirtied)
		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
	exit_rcu();
	exit_tasks_rcu_finish();

	lockdep_free_task(tsk);
	do_task_dead();
}

void __noreturn make_task_dead(int signr)
{
	/*
	 * Take the task off the cpu after something catastrophic has
	 * happened.
	 *
	 * We can get here from a kernel oops, sometimes with preemption off.
	 * Start by checking for critical errors.
	 * Then fix up important state like USER_DS and preemption.
	 * Then do everything else.
	 */
	struct task_struct *tsk = current;
	unsigned int limit;

	if (unlikely(in_interrupt()))
		panic("Aiee, killing interrupt handler!");
	if (unlikely(!tsk->pid))
		panic("Attempted to kill the idle task!");

	if (unlikely(irqs_disabled())) {
		pr_info("note: %s[%d] exited with irqs disabled\n",
			current->comm, task_pid_nr(current));
		local_irq_enable();
	}
	if (unlikely(in_atomic())) {
		pr_info("note: %s[%d] exited with preempt_count %d\n",
			current->comm, task_pid_nr(current),
			preempt_count());
		preempt_count_set(PREEMPT_ENABLED);
	}

	/*
	 * Every time the system oopses, if the oops happens while a reference
	 * to an object was held, the reference leaks.
	 * If the oops doesn't also leak memory, repeated oopsing can cause
	 * reference counters to wrap around (if they're not using refcount_t).
	 * This means that repeated oopsing can make unexploitable-looking bugs
	 * exploitable through repeated oopsing.
	 * To make sure this can't happen, place an upper bound on how often the
	 * kernel may oops without panic().
	 */
	limit = READ_ONCE(oops_limit);
	if (atomic_inc_return(&oops_count) >= limit && limit)
		panic("Oopsed too often (kernel.oops_limit is %d)", limit);

	/*
	 * We're taking recursive faults here in make_task_dead. Safest is to just
	 * leave this task alone and wait for reboot.
	 */
	if (unlikely(tsk->flags & PF_EXITING)) {
		pr_alert("Fixing recursive fault but reboot is needed!\n");
		futex_exit_recursive(tsk);
		tsk->exit_state = EXIT_DEAD;
		refcount_inc(&tsk->rcu_users);
		do_task_dead();
	}

	do_exit(signr);
}

SYSCALL_DEFINE1(exit, int, error_code)
{
	do_exit((error_code&0xff)<<8);
}

/*
 * Take down every thread in the group.  This is called by fatal signals
 * as well as by sys_exit_group (below).
 */
void __noreturn
do_group_exit(int exit_code)
{
	struct signal_struct *sig = current->signal;

	if (sig->flags & SIGNAL_GROUP_EXIT)
		exit_code = sig->group_exit_code;
	else if (sig->group_exec_task)
		exit_code = 0;
	else {
		struct sighand_struct *const sighand = current->sighand;

		spin_lock_irq(&sighand->siglock);
		if (sig->flags & SIGNAL_GROUP_EXIT)
			/* Another thread got here before we took the lock.  */
			exit_code = sig->group_exit_code;
		else if (sig->group_exec_task)
			exit_code = 0;
		else {
			sig->group_exit_code = exit_code;
			sig->flags = SIGNAL_GROUP_EXIT;
			zap_other_threads(current);
		}
		spin_unlock_irq(&sighand->siglock);
	}

	do_exit(exit_code);
	/* NOTREACHED */
}

/*
 * this kills every thread in the thread group. Note that any externally
 * wait4()-ing process will get the correct exit code - even if this
 * thread is not the thread group leader.
 */
SYSCALL_DEFINE1(exit_group, int, error_code)
{
	do_group_exit((error_code & 0xff) << 8);
	/* NOTREACHED */
	return 0;
}

struct waitid_info {
	pid_t pid;
	uid_t uid;
	int status;
	int cause;
};

struct wait_opts {
	enum pid_type		wo_type;
	int			wo_flags;
	struct pid		*wo_pid;

	struct waitid_info	*wo_info;
	int			wo_stat;
	struct rusage		*wo_rusage;

	wait_queue_entry_t	child_wait;
	int			notask_error;
};

static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
	return	wo->wo_type == PIDTYPE_MAX ||
		task_pid_type(p, wo->wo_type) == wo->wo_pid;
}

static int
eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
{
	if (!eligible_pid(wo, p))
		return 0;

	/*
	 * Wait for all children (clone and not) if __WALL is set or
	 * if it is traced by us.
	 */
	if (ptrace || (wo->wo_flags & __WALL))
		return 1;

	/*
	 * Otherwise, wait for clone children *only* if __WCLONE is set;
	 * otherwise, wait for non-clone children *only*.
	 *
	 * Note: a "clone" child here is one that reports to its parent
	 * using a signal other than SIGCHLD, or a non-leader thread which
	 * we can only see if it is traced by us.
	 */
	if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
		return 0;

	return 1;
}

/*
 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
	int state, status;
	pid_t pid = task_pid_vnr(p);
	uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
	struct waitid_info *infop;

	if (!likely(wo->wo_flags & WEXITED))
		return 0;

	if (unlikely(wo->wo_flags & WNOWAIT)) {
		status = (p->signal->flags & SIGNAL_GROUP_EXIT)
			? p->signal->group_exit_code : p->exit_code;
		get_task_struct(p);
		read_unlock(&tasklist_lock);
		sched_annotate_sleep();
		if (wo->wo_rusage)
			getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
		put_task_struct(p);
		goto out_info;
	}
	/*
	 * Move the task's state to DEAD/TRACE, only one thread can do this.
	 */
	state = (ptrace_reparented(p) && thread_group_leader(p)) ?
		EXIT_TRACE : EXIT_DEAD;
	if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
		return 0;
	/*
	 * We own this thread, nobody else can reap it.
	 */
	read_unlock(&tasklist_lock);
	sched_annotate_sleep();

	/*
	 * Check thread_group_leader() to exclude the traced sub-threads.
	 */
	if (state == EXIT_DEAD && thread_group_leader(p)) {
		struct signal_struct *sig = p->signal;
		struct signal_struct *psig = current->signal;
		unsigned long maxrss;
		u64 tgutime, tgstime;

		/*
		 * The resource counters for the group leader are in its
		 * own task_struct.  Those for dead threads in the group
		 * are in its signal_struct, as are those for the child
		 * processes it has previously reaped.  All these
		 * accumulate in the parent's signal_struct c* fields.
		 *
		 * We don't bother to take a lock here to protect these
		 * p->signal fields because the whole thread group is dead
		 * and nobody can change them.
		 *
		 * psig->stats_lock also protects us from our sub-threads
		 * which can reap other children at the same time. Until
		 * we change k_getrusage()-like users to rely on this lock
		 * we have to take ->siglock as well.
		 *
		 * We use thread_group_cputime_adjusted() to get times for
		 * the thread group, which consolidates times for all threads
		 * in the group including the group leader.
		 */
		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
		spin_lock_irq(&current->sighand->siglock);
		write_seqlock(&psig->stats_lock);
		psig->cutime += tgutime + sig->cutime;
		psig->cstime += tgstime + sig->cstime;
		psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
		psig->cmin_flt +=
			p->min_flt + sig->min_flt + sig->cmin_flt;
		psig->cmaj_flt +=
			p->maj_flt + sig->maj_flt + sig->cmaj_flt;
		psig->cnvcsw +=
			p->nvcsw + sig->nvcsw + sig->cnvcsw;
		psig->cnivcsw +=
			p->nivcsw + sig->nivcsw + sig->cnivcsw;
		psig->cinblock +=
			task_io_get_inblock(p) +
			sig->inblock + sig->cinblock;
		psig->coublock +=
			task_io_get_oublock(p) +
			sig->oublock + sig->coublock;
		maxrss = max(sig->maxrss, sig->cmaxrss);
		if (psig->cmaxrss < maxrss)
			psig->cmaxrss = maxrss;
		task_io_accounting_add(&psig->ioac, &p->ioac);
		task_io_accounting_add(&psig->ioac, &sig->ioac);
		write_sequnlock(&psig->stats_lock);
		spin_unlock_irq(&current->sighand->siglock);
	}

	if (wo->wo_rusage)
		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
		? p->signal->group_exit_code : p->exit_code;
	wo->wo_stat = status;

	if (state == EXIT_TRACE) {
		write_lock_irq(&tasklist_lock);
		/* We dropped tasklist, ptracer could die and untrace */
		ptrace_unlink(p);

		/* If parent wants a zombie, don't release it now */
		state = EXIT_ZOMBIE;
		if (do_notify_parent(p, p->exit_signal))
			state = EXIT_DEAD;
		p->exit_state = state;
		write_unlock_irq(&tasklist_lock);
	}
	if (state == EXIT_DEAD)
		release_task(p);

out_info:
	infop = wo->wo_info;
	if (infop) {
		if ((status & 0x7f) == 0) {
			infop->cause = CLD_EXITED;
			infop->status = status >> 8;
		} else {
			infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
			infop->status = status & 0x7f;
		}
		infop->pid = pid;
		infop->uid = uid;
	}

	return pid;
}

static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
	if (ptrace) {
		if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
			return &p->exit_code;
	} else {
		if (p->signal->flags & SIGNAL_STOP_STOPPED)
			return &p->signal->group_exit_code;
	}
	return NULL;
}

/**
 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
 * @wo: wait options
 * @ptrace: is the wait for ptrace
 * @p: task to wait for
 *
 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
 *
 * CONTEXT:
 * read_lock(&tasklist_lock), which is released if return value is
 * non-zero.  Also, grabs and releases @p->sighand->siglock.
 *
 * RETURNS:
 * 0 if wait condition didn't exist and search for other wait conditions
 * should continue.  Non-zero return, -errno on failure and @p's pid on
 * success, implies that tasklist_lock is released and wait condition
 * search should terminate.
 */
static int wait_task_stopped(struct wait_opts *wo,
				int ptrace, struct task_struct *p)
{
	struct waitid_info *infop;
	int exit_code, *p_code, why;
	uid_t uid = 0; /* unneeded, required by compiler */
	pid_t pid;

	/*
	 * Traditionally we see ptrace'd stopped tasks regardless of options.
	 */
	if (!ptrace && !(wo->wo_flags & WUNTRACED))
		return 0;

	if (!task_stopped_code(p, ptrace))
		return 0;

	exit_code = 0;
	spin_lock_irq(&p->sighand->siglock);

	p_code = task_stopped_code(p, ptrace);
	if (unlikely(!p_code))
		goto unlock_sig;

	exit_code = *p_code;
	if (!exit_code)
		goto unlock_sig;

	if (!unlikely(wo->wo_flags & WNOWAIT))
		*p_code = 0;

	uid = from_kuid_munged(current_user_ns(), task_uid(p));
unlock_sig:
	spin_unlock_irq(&p->sighand->siglock);
	if (!exit_code)
		return 0;

	/*
	 * Now we are pretty sure this task is interesting.
	 * Make sure it doesn't get reaped out from under us while we
	 * give up the lock and then examine it below.  We don't want to
	 * keep holding onto the tasklist_lock while we call getrusage and
	 * possibly take page faults for user memory.
	 */
	get_task_struct(p);
	pid = task_pid_vnr(p);
	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
	read_unlock(&tasklist_lock);
	sched_annotate_sleep();
	if (wo->wo_rusage)
		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
	put_task_struct(p);

	if (likely(!(wo->wo_flags & WNOWAIT)))
		wo->wo_stat = (exit_code << 8) | 0x7f;

	infop = wo->wo_info;
	if (infop) {
		infop->cause = why;
		infop->status = exit_code;
		infop->pid = pid;
		infop->uid = uid;
	}
	return pid;
}

/*
 * Handle do_wait work for one task in a live, non-stopped state.
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
	struct waitid_info *infop;
	pid_t pid;
	uid_t uid;

	if (!unlikely(wo->wo_flags & WCONTINUED))
		return 0;

	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
		return 0;

	spin_lock_irq(&p->sighand->siglock);
	/* Re-check with the lock held.  */
	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
		spin_unlock_irq(&p->sighand->siglock);
		return 0;
	}
	if (!unlikely(wo->wo_flags & WNOWAIT))
		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
	uid = from_kuid_munged(current_user_ns(), task_uid(p));
	spin_unlock_irq(&p->sighand->siglock);

	pid = task_pid_vnr(p);
	get_task_struct(p);
	read_unlock(&tasklist_lock);
	sched_annotate_sleep();
	if (wo->wo_rusage)
		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
	put_task_struct(p);

	infop = wo->wo_info;
	if (!infop) {
		wo->wo_stat = 0xffff;
	} else {
		infop->cause = CLD_CONTINUED;
		infop->pid = pid;
		infop->uid = uid;
		infop->status = SIGCONT;
	}
	return pid;
}

/*
 * Consider @p for a wait by @parent.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue;
 * then ->notask_error is 0 if @p is an eligible child,
 * or still -ECHILD.
 */
static int wait_consider_task(struct wait_opts *wo, int ptrace,
				struct task_struct *p)
{
	/*
	 * We can race with wait_task_zombie() from another thread.
	 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
	 * can't confuse the checks below.
	 */
	int exit_state = READ_ONCE(p->exit_state);
	int ret;

	if (unlikely(exit_state == EXIT_DEAD))
		return 0;

	ret = eligible_child(wo, ptrace, p);
	if (!ret)
		return ret;

	if (unlikely(exit_state == EXIT_TRACE)) {
		/*
		 * ptrace == 0 means we are the natural parent. In this case
		 * we should clear notask_error, debugger will notify us.
		 */
		if (likely(!ptrace))
			wo->notask_error = 0;
		return 0;
	}

	if (likely(!ptrace) && unlikely(p->ptrace)) {
		/*
		 * If it is traced by its real parent's group, just pretend
		 * the caller is ptrace_do_wait() and reap this child if it
		 * is zombie.
		 *
		 * This also hides group stop state from real parent; otherwise
		 * a single stop can be reported twice as group and ptrace stop.
		 * If a ptracer wants to distinguish these two events for its
		 * own children it should create a separate process which takes
		 * the role of real parent.
		 */
		if (!ptrace_reparented(p))
			ptrace = 1;
	}

	/* slay zombie? */
	if (exit_state == EXIT_ZOMBIE) {
		/* we don't reap group leaders with subthreads */
		if (!delay_group_leader(p)) {
			/*
			 * A zombie ptracee is only visible to its ptracer.
			 * Notification and reaping will be cascaded to the
			 * real parent when the ptracer detaches.
			 */
			if (unlikely(ptrace) || likely(!p->ptrace))
				return wait_task_zombie(wo, p);
		}

		/*
		 * Allow access to stopped/continued state via zombie by
		 * falling through.  Clearing of notask_error is complex.
		 *
		 * When !@ptrace:
		 *
		 * If WEXITED is set, notask_error should naturally be
		 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
		 * so, if there are live subthreads, there are events to
		 * wait for.  If all subthreads are dead, it's still safe
		 * to clear - this function will be called again in finite
		 * amount time once all the subthreads are released and
		 * will then return without clearing.
		 *
		 * When @ptrace:
		 *
		 * Stopped state is per-task and thus can't change once the
		 * target task dies.  Only continued and exited can happen.
		 * Clear notask_error if WCONTINUED | WEXITED.
		 */
		if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
			wo->notask_error = 0;
	} else {
		/*
		 * @p is alive and it's gonna stop, continue or exit, so
		 * there always is something to wait for.
		 */
		wo->notask_error = 0;
	}

	/*
	 * Wait for stopped.  Depending on @ptrace, different stopped state
	 * is used and the two don't interact with each other.
	 */
	ret = wait_task_stopped(wo, ptrace, p);
	if (ret)
		return ret;

	/*
	 * Wait for continued.  There's only one continued state and the
	 * ptracer can consume it which can confuse the real parent.  Don't
	 * use WCONTINUED from ptracer.  You don't need or want it.
	 */
	return wait_task_continued(wo, p);
}

/*
 * Do the work of do_wait() for one thread in the group, @tsk.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue; then
 * ->notask_error is 0 if there were any eligible children,
 * or still -ECHILD.
 */
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
	struct task_struct *p;

	list_for_each_entry(p, &tsk->children, sibling) {
		int ret = wait_consider_task(wo, 0, p);

		if (ret)
			return ret;
	}

	return 0;
}

static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
{
	struct task_struct *p;

	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
		int ret = wait_consider_task(wo, 1, p);

		if (ret)
			return ret;
	}

	return 0;
}

static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
				int sync, void *key)
{
	struct wait_opts *wo = container_of(wait, struct wait_opts,
						child_wait);
	struct task_struct *p = key;

	if (!eligible_pid(wo, p))
		return 0;

	if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
		return 0;

	return default_wake_function(wait, mode, sync, key);
}

void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
	__wake_up_sync_key(&parent->signal->wait_chldexit,
			   TASK_INTERRUPTIBLE, p);
}

static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
				 struct task_struct *target)
{
	struct task_struct *parent =
		!ptrace ? target->real_parent : target->parent;

	return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
				     same_thread_group(current, parent));
}

/*
 * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
 * and tracee lists to find the target task.
 */
static int do_wait_pid(struct wait_opts *wo)
{
	bool ptrace;
	struct task_struct *target;
	int retval;

	ptrace = false;
	target = pid_task(wo->wo_pid, PIDTYPE_TGID);
	if (target && is_effectively_child(wo, ptrace, target)) {
		retval = wait_consider_task(wo, ptrace, target);
		if (retval)
			return retval;
	}

	ptrace = true;
	target = pid_task(wo->wo_pid, PIDTYPE_PID);
	if (target && target->ptrace &&
	    is_effectively_child(wo, ptrace, target)) {
		retval = wait_consider_task(wo, ptrace, target);
		if (retval)
			return retval;
	}

	return 0;
}

static long do_wait(struct wait_opts *wo)
{
	int retval;

	trace_sched_process_wait(wo->wo_pid);

	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
	wo->child_wait.private = current;
	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
repeat:
	/*
	 * If there is nothing that can match our criteria, just get out.
	 * We will clear ->notask_error to zero if we see any child that
	 * might later match our criteria, even if we are not able to reap
	 * it yet.
	 */
	wo->notask_error = -ECHILD;
	if ((wo->wo_type < PIDTYPE_MAX) &&
	   (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
		goto notask;

	set_current_state(TASK_INTERRUPTIBLE);
	read_lock(&tasklist_lock);

	if (wo->wo_type == PIDTYPE_PID) {
		retval = do_wait_pid(wo);
		if (retval)
			goto end;
	} else {
		struct task_struct *tsk = current;

		do {
			retval = do_wait_thread(wo, tsk);
			if (retval)
				goto end;

			retval = ptrace_do_wait(wo, tsk);
			if (retval)
				goto end;

			if (wo->wo_flags & __WNOTHREAD)
				break;
		} while_each_thread(current, tsk);
	}
	read_unlock(&tasklist_lock);

notask:
	retval = wo->notask_error;
	if (!retval && !(wo->wo_flags & WNOHANG)) {
		retval = -ERESTARTSYS;
		if (!signal_pending(current)) {
			schedule();
			goto repeat;
		}
	}
end:
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
	return retval;
}

static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
			  int options, struct rusage *ru)
{
	struct wait_opts wo;
	struct pid *pid = NULL;
	enum pid_type type;
	long ret;
	unsigned int f_flags = 0;

	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
			__WNOTHREAD|__WCLONE|__WALL))
		return -EINVAL;
	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
		return -EINVAL;

	switch (which) {
	case P_ALL:
		type = PIDTYPE_MAX;
		break;
	case P_PID:
		type = PIDTYPE_PID;
		if (upid <= 0)
			return -EINVAL;

		pid = find_get_pid(upid);
		break;
	case P_PGID:
		type = PIDTYPE_PGID;
		if (upid < 0)
			return -EINVAL;

		if (upid)
			pid = find_get_pid(upid);
		else
			pid = get_task_pid(current, PIDTYPE_PGID);
		break;
	case P_PIDFD:
		type = PIDTYPE_PID;
		if (upid < 0)
			return -EINVAL;

		pid = pidfd_get_pid(upid, &f_flags);
		if (IS_ERR(pid))
			return PTR_ERR(pid);

		break;
	default:
		return -EINVAL;
	}

	wo.wo_type	= type;
	wo.wo_pid	= pid;
	wo.wo_flags	= options;
	wo.wo_info	= infop;
	wo.wo_rusage	= ru;
	if (f_flags & O_NONBLOCK)
		wo.wo_flags |= WNOHANG;

	ret = do_wait(&wo);
	if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
		ret = -EAGAIN;

	put_pid(pid);
	return ret;
}

SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
		infop, int, options, struct rusage __user *, ru)
{
	struct rusage r;
	struct waitid_info info = {.status = 0};
	long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
	int signo = 0;

	if (err > 0) {
		signo = SIGCHLD;
		err = 0;
		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
			return -EFAULT;
	}
	if (!infop)
		return err;

	if (!user_write_access_begin(infop, sizeof(*infop)))
		return -EFAULT;

	unsafe_put_user(signo, &infop->si_signo, Efault);
	unsafe_put_user(0, &infop->si_errno, Efault);
	unsafe_put_user(info.cause, &infop->si_code, Efault);
	unsafe_put_user(info.pid, &infop->si_pid, Efault);
	unsafe_put_user(info.uid, &infop->si_uid, Efault);
	unsafe_put_user(info.status, &infop->si_status, Efault);
	user_write_access_end();
	return err;
Efault:
	user_write_access_end();
	return -EFAULT;
}

long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
		  struct rusage *ru)
{
	struct wait_opts wo;
	struct pid *pid = NULL;
	enum pid_type type;
	long ret;

	if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
			__WNOTHREAD|__WCLONE|__WALL))
		return -EINVAL;

	/* -INT_MIN is not defined */
	if (upid == INT_MIN)
		return -ESRCH;

	if (upid == -1)
		type = PIDTYPE_MAX;
	else if (upid < 0) {
		type = PIDTYPE_PGID;
		pid = find_get_pid(-upid);
	} else if (upid == 0) {
		type = PIDTYPE_PGID;
		pid = get_task_pid(current, PIDTYPE_PGID);
	} else /* upid > 0 */ {
		type = PIDTYPE_PID;
		pid = find_get_pid(upid);
	}

	wo.wo_type	= type;
	wo.wo_pid	= pid;
	wo.wo_flags	= options | WEXITED;
	wo.wo_info	= NULL;
	wo.wo_stat	= 0;
	wo.wo_rusage	= ru;
	ret = do_wait(&wo);
	put_pid(pid);
	if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
		ret = -EFAULT;

	return ret;
}

int kernel_wait(pid_t pid, int *stat)
{
	struct wait_opts wo = {
		.wo_type	= PIDTYPE_PID,
		.wo_pid		= find_get_pid(pid),
		.wo_flags	= WEXITED,
	};
	int ret;

	ret = do_wait(&wo);
	if (ret > 0 && wo.wo_stat)
		*stat = wo.wo_stat;
	put_pid(wo.wo_pid);
	return ret;
}

SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
		int, options, struct rusage __user *, ru)
{
	struct rusage r;
	long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);

	if (err > 0) {
		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
			return -EFAULT;
	}
	return err;
}

#ifdef __ARCH_WANT_SYS_WAITPID

/*
 * sys_waitpid() remains for compatibility. waitpid() should be
 * implemented by calling sys_wait4() from libc.a.
 */
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
{
	return kernel_wait4(pid, stat_addr, options, NULL);
}

#endif

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(wait4,
	compat_pid_t, pid,
	compat_uint_t __user *, stat_addr,
	int, options,
	struct compat_rusage __user *, ru)
{
	struct rusage r;
	long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
	if (err > 0) {
		if (ru && put_compat_rusage(&r, ru))
			return -EFAULT;
	}
	return err;
}

COMPAT_SYSCALL_DEFINE5(waitid,
		int, which, compat_pid_t, pid,
		struct compat_siginfo __user *, infop, int, options,
		struct compat_rusage __user *, uru)
{
	struct rusage ru;
	struct waitid_info info = {.status = 0};
	long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
	int signo = 0;
	if (err > 0) {
		signo = SIGCHLD;
		err = 0;
		if (uru) {
			/* kernel_waitid() overwrites everything in ru */
			if (COMPAT_USE_64BIT_TIME)
				err = copy_to_user(uru, &ru, sizeof(ru));
			else
				err = put_compat_rusage(&ru, uru);
			if (err)
				return -EFAULT;
		}
	}

	if (!infop)
		return err;

	if (!user_write_access_begin(infop, sizeof(*infop)))
		return -EFAULT;

	unsafe_put_user(signo, &infop->si_signo, Efault);
	unsafe_put_user(0, &infop->si_errno, Efault);
	unsafe_put_user(info.cause, &infop->si_code, Efault);
	unsafe_put_user(info.pid, &infop->si_pid, Efault);
	unsafe_put_user(info.uid, &infop->si_uid, Efault);
	unsafe_put_user(info.status, &infop->si_status, Efault);
	user_write_access_end();
	return err;
Efault:
	user_write_access_end();
	return -EFAULT;
}
#endif

/**
 * thread_group_exited - check that a thread group has exited
 * @pid: tgid of thread group to be checked.
 *
 * Test if the thread group represented by tgid has exited (all
 * threads are zombies, dead or completely gone).
 *
 * Return: true if the thread group has exited. false otherwise.
 */
bool thread_group_exited(struct pid *pid)
{
	struct task_struct *task;
	bool exited;

	rcu_read_lock();
	task = pid_task(pid, PIDTYPE_PID);
	exited = !task ||
		(READ_ONCE(task->exit_state) && thread_group_empty(task));
	rcu_read_unlock();

	return exited;
}
EXPORT_SYMBOL(thread_group_exited);

/*
 * This needs to be __function_aligned as GCC implicitly makes any
 * implementation of abort() cold and drops alignment specified by
 * -falign-functions=N.
 *
 * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
 */
__weak __function_aligned void abort(void)
{
	BUG();

	/* if that doesn't kill us, halt */
	panic("Oops failed to kill thread");
}
EXPORT_SYMBOL(abort);