// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/exit.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/posix-timers.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h>	/* for audit_free() */
#include <linux/resource.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
#include <linux/rethook.h>
#include <linux/sysfs.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/mmu_context.h>

/*
 * The default value should be high enough to not crash a system that randomly
 * crashes its kernel from time to time, but low enough to at least not permit
 * overflowing 32-bit refcounts or the ldsem writer count.
 */
static unsigned int oops_limit = 10000;

#ifdef CONFIG_SYSCTL
static struct ctl_table kern_exit_table[] = {
	{
		.procname	= "oops_limit",
		.data		= &oops_limit,
		.maxlen		= sizeof(oops_limit),
		.mode		= 0644,
		.proc_handler	= proc_douintvec,
	},
	{ }
};

static __init int kernel_exit_sysctls_init(void)
{
	register_sysctl_init("kernel", kern_exit_table);
	return 0;
}
late_initcall(kernel_exit_sysctls_init);
#endif

static atomic_t oops_count = ATOMIC_INIT(0);

#ifdef CONFIG_SYSFS
static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
			       char *page)
{
	return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
}

static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);

static __init int kernel_exit_sysfs_init(void)
{
	sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
	return 0;
}
late_initcall(kernel_exit_sysfs_init);
#endif

static void __unhash_process(struct task_struct *p, bool group_dead)
{
	nr_threads--;
	detach_pid(p, PIDTYPE_PID);
	if (group_dead) {
		detach_pid(p, PIDTYPE_TGID);
		detach_pid(p, PIDTYPE_PGID);
		detach_pid(p, PIDTYPE_SID);

		list_del_rcu(&p->tasks);
		list_del_init(&p->sibling);
		__this_cpu_dec(process_counts);
	}
	list_del_rcu(&p->thread_group);
	list_del_rcu(&p->thread_node);
}

/*
 * This function expects the tasklist_lock write-locked.
 */
static void __exit_signal(struct task_struct *tsk)
{
	struct signal_struct *sig = tsk->signal;
	bool group_dead = thread_group_leader(tsk);
	struct sighand_struct *sighand;
	struct tty_struct *tty;
	u64 utime, stime;

	sighand = rcu_dereference_check(tsk->sighand,
					lockdep_tasklist_lock_is_held());
	spin_lock(&sighand->siglock);

#ifdef CONFIG_POSIX_TIMERS
	posix_cpu_timers_exit(tsk);
	if (group_dead)
		posix_cpu_timers_exit_group(tsk);
#endif

	if (group_dead) {
		tty = sig->tty;
		sig->tty = NULL;
	} else {
		/*
		 * If there is any task waiting for the group exit
		 * then notify it:
		 */
		if (sig->notify_count > 0 && !--sig->notify_count)
			wake_up_process(sig->group_exec_task);

		if (tsk == sig->curr_target)
			sig->curr_target = next_thread(tsk);
	}

	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
			      sizeof(unsigned long long));

	/*
	 * Accumulate here the counters for all threads as they die. We could
	 * skip the group leader because it is the last user of signal_struct,
	 * but we want to avoid the race with thread_group_cputime() which can
	 * see the empty ->thread_head list.
	 */
	task_cputime(tsk, &utime, &stime);
	write_seqlock(&sig->stats_lock);
	sig->utime += utime;
	sig->stime += stime;
	sig->gtime += task_gtime(tsk);
	sig->min_flt += tsk->min_flt;
	sig->maj_flt += tsk->maj_flt;
	sig->nvcsw += tsk->nvcsw;
	sig->nivcsw += tsk->nivcsw;
	sig->inblock += task_io_get_inblock(tsk);
	sig->oublock += task_io_get_oublock(tsk);
	task_io_accounting_add(&sig->ioac, &tsk->ioac);
	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
	sig->nr_threads--;
	__unhash_process(tsk, group_dead);
	write_sequnlock(&sig->stats_lock);

	/*
	 * Do this under ->siglock, we can race with another thread
	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
	 */
	flush_sigqueue(&tsk->pending);
	tsk->sighand = NULL;
	spin_unlock(&sighand->siglock);

	__cleanup_sighand(sighand);
	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
	if (group_dead) {
		flush_sigqueue(&sig->shared_pending);
		tty_kref_put(tty);
	}
}

static void delayed_put_task_struct(struct rcu_head *rhp)
{
	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);

	kprobe_flush_task(tsk);
	rethook_flush_task(tsk);
	perf_event_delayed_put(tsk);
	trace_sched_process_free(tsk);
	put_task_struct(tsk);
}

void put_task_struct_rcu_user(struct task_struct *task)
{
	if (refcount_dec_and_test(&task->rcu_users))
		call_rcu(&task->rcu, delayed_put_task_struct);
}

void __weak release_thread(struct task_struct *dead_task)
{
}

void release_task(struct task_struct *p)
{
	struct task_struct *leader;
	struct pid *thread_pid;
	int zap_leader;
repeat:
	/* don't need to get the RCU readlock here - the process is dead and
	 * can't be modifying its own credentials. But shut RCU-lockdep up */
	rcu_read_lock();
	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
	rcu_read_unlock();

	cgroup_release(p);

	write_lock_irq(&tasklist_lock);
	ptrace_release_task(p);
	thread_pid = get_pid(p->thread_pid);
	__exit_signal(p);

	/*
	 * If we are the last non-leader member of the thread
	 * group, and the leader is zombie, then notify the
	 * group leader's parent process. (if it wants notification.)
	 */
	zap_leader = 0;
	leader = p->group_leader;
	if (leader != p && thread_group_empty(leader)
			&& leader->exit_state == EXIT_ZOMBIE) {
		/*
		 * If we were the last child thread and the leader has
		 * exited already, and the leader's parent ignores SIGCHLD,
		 * then we are the one who should release the leader.
		 */
		zap_leader = do_notify_parent(leader, leader->exit_signal);
		if (zap_leader)
			leader->exit_state = EXIT_DEAD;
	}

	write_unlock_irq(&tasklist_lock);
	seccomp_filter_release(p);
	proc_flush_pid(thread_pid);
	put_pid(thread_pid);
	release_thread(p);
	put_task_struct_rcu_user(p);

	p = leader;
	if (unlikely(zap_leader))
		goto repeat;
}

int rcuwait_wake_up(struct rcuwait *w)
{
	int ret = 0;
	struct task_struct *task;

	rcu_read_lock();

	/*
	 * Order condition vs @task, such that everything prior to the load
	 * of @task is visible. This is the condition as to why the user called
	 * rcuwait_wake() in the first place. Pairs with set_current_state()
	 * barrier (A) in rcuwait_wait_event().
	 *
	 *    WAIT                WAKE
	 *    [S] tsk = current	  [S] cond = true
	 *        MB (A)	      MB (B)
	 *    [L] cond		  [L] tsk
	 */
	smp_mb(); /* (B) */

	task = rcu_dereference(w->task);
	if (task)
		ret = wake_up_process(task);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL_GPL(rcuwait_wake_up);

/*
 * Determine if a process group is "orphaned", according to the POSIX
 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 * by terminal-generated stop signals.  Newly orphaned process groups are
 * to receive a SIGHUP and a SIGCONT.
 *
 * "I ask you, have you ever known what it is to be an orphan?"
 */
static int will_become_orphaned_pgrp(struct pid *pgrp,
					struct task_struct *ignored_task)
{
	struct task_struct *p;

	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
		if ((p == ignored_task) ||
		    (p->exit_state && thread_group_empty(p)) ||
		    is_global_init(p->real_parent))
			continue;

		if (task_pgrp(p->real_parent) != pgrp &&
		    task_session(p->real_parent) == task_session(p))
			return 0;
	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);

	return 1;
}

int is_current_pgrp_orphaned(void)
{
	int retval;

	read_lock(&tasklist_lock);
	retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
	read_unlock(&tasklist_lock);

	return retval;
}

static bool has_stopped_jobs(struct pid *pgrp)
{
	struct task_struct *p;

	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
		if (p->signal->flags & SIGNAL_STOP_STOPPED)
			return true;
	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);

	return false;
}

/*
 * Check to see if any process groups have become orphaned as
 * a result of our exiting, and if they have any stopped jobs,
 * send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 */
static void
kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
{
	struct pid *pgrp = task_pgrp(tsk);
	struct task_struct *ignored_task = tsk;

	if (!parent)
		/* exit: our father is in a different pgrp than
		 * we are and we were the only connection outside.
		 */
		parent = tsk->real_parent;
	else
		/* reparent: our child is in a different pgrp than
		 * we are, and it was the only connection outside.
		 */
		ignored_task = NULL;

	if (task_pgrp(parent) != pgrp &&
	    task_session(parent) == task_session(tsk) &&
	    will_become_orphaned_pgrp(pgrp, ignored_task) &&
	    has_stopped_jobs(pgrp)) {
		__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
		__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
	}
}

static void coredump_task_exit(struct task_struct *tsk)
{
	struct core_state *core_state;

	/*
	 * Serialize with any possible pending coredump.
	 * We must hold siglock around checking core_state
	 * and setting PF_POSTCOREDUMP.  The core-inducing thread
	 * will increment ->nr_threads for each thread in the
	 * group without PF_POSTCOREDUMP set.
	 */
	spin_lock_irq(&tsk->sighand->siglock);
	tsk->flags |= PF_POSTCOREDUMP;
	core_state = tsk->signal->core_state;
	spin_unlock_irq(&tsk->sighand->siglock);
	if (core_state) {
		struct core_thread self;

		self.task = current;
		if (self.task->flags & PF_SIGNALED)
			self.next = xchg(&core_state->dumper.next, &self);
		else
			self.task = NULL;
		/*
		 * Implies mb(), the result of xchg() must be visible
		 * to core_state->dumper.
		 */
		if (atomic_dec_and_test(&core_state->nr_threads))
			complete(&core_state->startup);

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
			if (!self.task) /* see coredump_finish() */
				break;
			schedule();
		}
		__set_current_state(TASK_RUNNING);
	}
}

#ifdef CONFIG_MEMCG
/*
 * A task is exiting.   If it owned this mm, find a new owner for the mm.
 */
void mm_update_next_owner(struct mm_struct *mm)
{
	struct task_struct *c, *g, *p = current;

retry:
	/*
	 * If the exiting or execing task is not the owner, it's
	 * someone else's problem.
	 */
	if (mm->owner != p)
		return;
	/*
	 * The current owner is exiting/execing and there are no other
	 * candidates.  Do not leave the mm pointing to a possibly
	 * freed task structure.
	 */
	if (atomic_read(&mm->mm_users) <= 1) {
		WRITE_ONCE(mm->owner, NULL);
		return;
	}

	read_lock(&tasklist_lock);
	/*
	 * Search in the children
	 */
	list_for_each_entry(c, &p->children, sibling) {
		if (c->mm == mm)
			goto assign_new_owner;
	}

	/*
	 * Search in the siblings
	 */
	list_for_each_entry(c, &p->real_parent->children, sibling) {
		if (c->mm == mm)
			goto assign_new_owner;
	}

	/*
	 * Search through everything else, we should not get here often.
	 */
	for_each_process(g) {
		if (g->flags & PF_KTHREAD)
			continue;
		for_each_thread(g, c) {
			if (c->mm == mm)
				goto assign_new_owner;
			if (c->mm)
				break;
		}
	}
	read_unlock(&tasklist_lock);
	/*
	 * We found no owner yet mm_users > 1: this implies that we are
	 * most likely racing with swapoff (try_to_unuse()) or /proc or
	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
	 */
	WRITE_ONCE(mm->owner, NULL);
	return;

assign_new_owner:
	BUG_ON(c == p);
	get_task_struct(c);
	/*
	 * The task_lock protects c->mm from changing.
	 * We always want mm->owner->mm == mm
	 */
	task_lock(c);
	/*
	 * Delay read_unlock() till we have the task_lock()
	 * to ensure that c does not slip away underneath us
	 */
	read_unlock(&tasklist_lock);
	if (c->mm != mm) {
		task_unlock(c);
		put_task_struct(c);
		goto retry;
	}
	WRITE_ONCE(mm->owner, c);
	lru_gen_migrate_mm(mm);
	task_unlock(c);
	put_task_struct(c);
}
#endif /* CONFIG_MEMCG */

/*
 * Turn us into a lazy TLB process if we
 * aren't already..
 */
static void exit_mm(void)
{
	struct mm_struct *mm = current->mm;

	exit_mm_release(current, mm);
	if (!mm)
		return;
	sync_mm_rss(mm);
	mmap_read_lock(mm);
	mmgrab(mm);
	BUG_ON(mm != current->active_mm);
	/* more a memory barrier than a real lock */
	task_lock(current);
	/*
	 * When a thread stops operating on an address space, the loop
	 * in membarrier_private_expedited() may not observe that
	 * tsk->mm, and the loop in membarrier_global_expedited() may
	 * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
	 * rq->membarrier_state, so those would not issue an IPI.
	 * Membarrier requires a memory barrier after accessing
	 * user-space memory, before clearing tsk->mm or the
	 * rq->membarrier_state.
	 */
	smp_mb__after_spinlock();
	local_irq_disable();
	current->mm = NULL;
	membarrier_update_current_mm(NULL);
	enter_lazy_tlb(mm, current);
	local_irq_enable();
	task_unlock(current);
	mmap_read_unlock(mm);
	mm_update_next_owner(mm);
	mmput(mm);
	if (test_thread_flag(TIF_MEMDIE))
		exit_oom_victim();
}

static struct task_struct *find_alive_thread(struct task_struct *p)
{
	struct task_struct *t;

	for_each_thread(p, t) {
		if (!(t->flags & PF_EXITING))
			return t;
	}
	return NULL;
}

static struct task_struct *find_child_reaper(struct task_struct *father,
						struct list_head *dead)
	__releases(&tasklist_lock)
	__acquires(&tasklist_lock)
{
	struct pid_namespace *pid_ns = task_active_pid_ns(father);
	struct task_struct *reaper = pid_ns->child_reaper;
	struct task_struct *p, *n;

	if (likely(reaper != father))
		return reaper;

	reaper = find_alive_thread(father);
	if (reaper) {
		pid_ns->child_reaper = reaper;
		return reaper;
	}

	write_unlock_irq(&tasklist_lock);

	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
		list_del_init(&p->ptrace_entry);
		release_task(p);
	}

	zap_pid_ns_processes(pid_ns);
	write_lock_irq(&tasklist_lock);

	return father;
}

/*
 * When we die, we re-parent all our children, and try to:
 * 1. give them to another thread in our thread group, if such a member exists
 * 2. give it to the first ancestor process which prctl'd itself as a
 *    child_subreaper for its children (like a service manager)
 * 3. give it to the init process (PID 1) in our pid namespace
 */
static struct task_struct *find_new_reaper(struct task_struct *father,
					   struct task_struct *child_reaper)
{
	struct task_struct *thread, *reaper;

	thread = find_alive_thread(father);
	if (thread)
		return thread;

	if (father->signal->has_child_subreaper) {
		unsigned int ns_level = task_pid(father)->level;
		/*
		 * Find the first ->is_child_subreaper ancestor in our pid_ns.
		 * We can't check reaper != child_reaper to ensure we do not
		 * cross the namespaces, the exiting parent could be injected
		 * by setns() + fork().
		 * We check pid->level, this is slightly more efficient than
		 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
		 */
		for (reaper = father->real_parent;
		     task_pid(reaper)->level == ns_level;
		     reaper = reaper->real_parent) {
			if (reaper == &init_task)
				break;
			if (!reaper->signal->is_child_subreaper)
				continue;
			thread = find_alive_thread(reaper);
			if (thread)
				return thread;
		}
	}

	return child_reaper;
}

/*
 * Any that need to be release_task'd are put on the @dead list.
 */
static void reparent_leader(struct task_struct *father, struct task_struct *p,
				struct list_head *dead)
{
	if (unlikely(p->exit_state == EXIT_DEAD))
		return;

	/* We don't want people slaying init. */
	p->exit_signal = SIGCHLD;

	/* If it has exited notify the new parent about this child's death. */
	if (!p->ptrace &&
	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
		if (do_notify_parent(p, p->exit_signal)) {
			p->exit_state = EXIT_DEAD;
			list_add(&p->ptrace_entry, dead);
		}
	}

	kill_orphaned_pgrp(p, father);
}

/*
 * This does two things:
 *
 * A.  Make init inherit all the child processes
 * B.  Check to see if any process groups have become orphaned
 *	as a result of our exiting, and if they have any stopped
 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 */
static void forget_original_parent(struct task_struct *father,
					struct list_head *dead)
{
	struct task_struct *p, *t, *reaper;

	if (unlikely(!list_empty(&father->ptraced)))
		exit_ptrace(father, dead);

	/* Can drop and reacquire tasklist_lock */
	reaper = find_child_reaper(father, dead);
	if (list_empty(&father->children))
		return;

	reaper = find_new_reaper(father, reaper);
	list_for_each_entry(p, &father->children, sibling) {
		for_each_thread(p, t) {
			RCU_INIT_POINTER(t->real_parent, reaper);
			BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
			if (likely(!t->ptrace))
				t->parent = t->real_parent;
			if (t->pdeath_signal)
				group_send_sig_info(t->pdeath_signal,
						    SEND_SIG_NOINFO, t,
						    PIDTYPE_TGID);
		}
		/*
		 * If this is a threaded reparent there is no need to
		 * notify anyone anything has happened.
		 */
		if (!same_thread_group(reaper, father))
			reparent_leader(father, p, dead);
	}
	list_splice_tail_init(&father->children, &reaper->children);
}

/*
 * Send signals to all our closest relatives so that they know
 * to properly mourn us..
 */
static void exit_notify(struct task_struct *tsk, int group_dead)
{
	bool autoreap;
	struct task_struct *p, *n;
	LIST_HEAD(dead);

	write_lock_irq(&tasklist_lock);
	forget_original_parent(tsk, &dead);

	if (group_dead)
		kill_orphaned_pgrp(tsk->group_leader, NULL);

	tsk->exit_state = EXIT_ZOMBIE;
	if (unlikely(tsk->ptrace)) {
		int sig = thread_group_leader(tsk) &&
				thread_group_empty(tsk) &&
				!ptrace_reparented(tsk) ?
			tsk->exit_signal : SIGCHLD;
		autoreap = do_notify_parent(tsk, sig);
	} else if (thread_group_leader(tsk)) {
		autoreap = thread_group_empty(tsk) &&
			do_notify_parent(tsk, tsk->exit_signal);
	} else {
		autoreap = true;
	}

	if (autoreap) {
		tsk->exit_state = EXIT_DEAD;
		list_add(&tsk->ptrace_entry, &dead);
	}

	/* mt-exec, de_thread() is waiting for group leader */
	if (unlikely(tsk->signal->notify_count < 0))
		wake_up_process(tsk->signal->group_exec_task);
	write_unlock_irq(&tasklist_lock);

	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
		list_del_init(&p->ptrace_entry);
		release_task(p);
	}
}

#ifdef CONFIG_DEBUG_STACK_USAGE
static void check_stack_usage(void)
{
	static DEFINE_SPINLOCK(low_water_lock);
	static int lowest_to_date = THREAD_SIZE;
	unsigned long free;

	free = stack_not_used(current);

	if (free >= lowest_to_date)
		return;

	spin_lock(&low_water_lock);
	if (free < lowest_to_date) {
		pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
			current->comm, task_pid_nr(current), free);
		lowest_to_date = free;
	}
	spin_unlock(&low_water_lock);
}
#else
static inline void check_stack_usage(void) {}
#endif

static void synchronize_group_exit(struct task_struct *tsk, long code)
{
	struct sighand_struct *sighand = tsk->sighand;
	struct signal_struct *signal = tsk->signal;

	spin_lock_irq(&sighand->siglock);
	signal->quick_threads--;
	if ((signal->quick_threads == 0) &&
	    !(signal->flags & SIGNAL_GROUP_EXIT)) {
		signal->flags = SIGNAL_GROUP_EXIT;
		signal->group_exit_code = code;
		signal->group_stop_count = 0;
	}
	spin_unlock_irq(&sighand->siglock);
}

void __noreturn do_exit(long code)
{
	struct task_struct *tsk = current;
	int group_dead;

	synchronize_group_exit(tsk, code);

	WARN_ON(tsk->plug);

	kcov_task_exit(tsk);
	kmsan_task_exit(tsk);

	coredump_task_exit(tsk);
	ptrace_event(PTRACE_EVENT_EXIT, code);

	validate_creds_for_do_exit(tsk);

	io_uring_files_cancel();
	exit_signals(tsk);  /* sets PF_EXITING */

	/* sync mm's RSS info before statistics gathering */
	if (tsk->mm)
		sync_mm_rss(tsk->mm);
	acct_update_integrals(tsk);
	group_dead = atomic_dec_and_test(&tsk->signal->live);
	if (group_dead) {
		/*
		 * If the last thread of global init has exited, panic
		 * immediately to get a useable coredump.
		 */
		if (unlikely(is_global_init(tsk)))
			panic("Attempted to kill init! exitcode=0x%08x\n",
				tsk->signal->group_exit_code ?: (int)code);

#ifdef CONFIG_POSIX_TIMERS
		hrtimer_cancel(&tsk->signal->real_timer);
		exit_itimers(tsk);
#endif
		if (tsk->mm)
			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
	}
	acct_collect(code, group_dead);
	if (group_dead)
		tty_audit_exit();
	audit_free(tsk);

	tsk->exit_code = code;
	taskstats_exit(tsk, group_dead);

	exit_mm();

	if (group_dead)
		acct_process();
	trace_sched_process_exit(tsk);

	exit_sem(tsk);
	exit_shm(tsk);
	exit_files(tsk);
	exit_fs(tsk);
	if (group_dead)
		disassociate_ctty(1);
	exit_task_namespaces(tsk);
	exit_task_work(tsk);
	exit_thread(tsk);

	/*
	 * Flush inherited counters to the parent - before the parent
	 * gets woken up by child-exit notifications.
	 *
	 * because of cgroup mode, must be called before cgroup_exit()
	 */
	perf_event_exit_task(tsk);

	sched_autogroup_exit_task(tsk);
	cgroup_exit(tsk);

	/*
	 * FIXME: do that only when needed, using sched_exit tracepoint
	 */
	flush_ptrace_hw_breakpoint(tsk);

	exit_tasks_rcu_start();
	exit_notify(tsk, group_dead);
	proc_exit_connector(tsk);
	mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
	if (unlikely(current->pi_state_cache))
		kfree(current->pi_state_cache);
#endif
	/*
	 * Make sure we are holding no locks:
	 */
	debug_check_no_locks_held();

	if (tsk->io_context)
		exit_io_context(tsk);

	if (tsk->splice_pipe)
		free_pipe_info(tsk->splice_pipe);

	if (tsk->task_frag.page)
		put_page(tsk->task_frag.page);

	validate_creds_for_do_exit(tsk);
	exit_task_stack_account(tsk);

	check_stack_usage();
	preempt_disable();
	if (tsk->nr_dirtied)
		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
	exit_rcu();
	exit_tasks_rcu_finish();

	lockdep_free_task(tsk);
	do_task_dead();
}

void __noreturn make_task_dead(int signr)
{
	/*
	 * Take the task off the cpu after something catastrophic has
	 * happened.
	 *
	 * We can get here from a kernel oops, sometimes with preemption off.
	 * Start by checking for critical errors.
	 * Then fix up important state like USER_DS and preemption.
	 * Then do everything else.
	 */
	struct task_struct *tsk = current;

	if (unlikely(in_interrupt()))
		panic("Aiee, killing interrupt handler!");
	if (unlikely(!tsk->pid))
		panic("Attempted to kill the idle task!");

	if (unlikely(in_atomic())) {
		pr_info("note: %s[%d] exited with preempt_count %d\n",
			current->comm, task_pid_nr(current),
			preempt_count());
		preempt_count_set(PREEMPT_ENABLED);
	}

	/*
	 * Every time the system oopses, if the oops happens while a reference
	 * to an object was held, the reference leaks.
	 * If the oops doesn't also leak memory, repeated oopsing can cause
	 * reference counters to wrap around (if they're not using refcount_t).
	 * This means that repeated oopsing can make unexploitable-looking bugs
	 * exploitable through repeated oopsing.
	 * To make sure this can't happen, place an upper bound on how often the
	 * kernel may oops without panic().
	 */
	if (atomic_inc_return(&oops_count) >= READ_ONCE(oops_limit) && oops_limit)
		panic("Oopsed too often (kernel.oops_limit is %d)", oops_limit);

	/*
	 * We're taking recursive faults here in make_task_dead. Safest is to just
	 * leave this task alone and wait for reboot.
	 */
	if (unlikely(tsk->flags & PF_EXITING)) {
		pr_alert("Fixing recursive fault but reboot is needed!\n");
		futex_exit_recursive(tsk);
		tsk->exit_state = EXIT_DEAD;
		refcount_inc(&tsk->rcu_users);
		do_task_dead();
	}

	do_exit(signr);
}

SYSCALL_DEFINE1(exit, int, error_code)
{
	do_exit((error_code&0xff)<<8);
}

/*
 * Take down every thread in the group.  This is called by fatal signals
 * as well as by sys_exit_group (below).
 */
void __noreturn
do_group_exit(int exit_code)
{
	struct signal_struct *sig = current->signal;

	if (sig->flags & SIGNAL_GROUP_EXIT)
		exit_code = sig->group_exit_code;
	else if (sig->group_exec_task)
		exit_code = 0;
	else {
		struct sighand_struct *const sighand = current->sighand;

		spin_lock_irq(&sighand->siglock);
		if (sig->flags & SIGNAL_GROUP_EXIT)
			/* Another thread got here before we took the lock.  */
			exit_code = sig->group_exit_code;
		else if (sig->group_exec_task)
			exit_code = 0;
		else {
			sig->group_exit_code = exit_code;
			sig->flags = SIGNAL_GROUP_EXIT;
			zap_other_threads(current);
		}
		spin_unlock_irq(&sighand->siglock);
	}

	do_exit(exit_code);
	/* NOTREACHED */
}

/*
 * this kills every thread in the thread group. Note that any externally
 * wait4()-ing process will get the correct exit code - even if this
 * thread is not the thread group leader.
 */
SYSCALL_DEFINE1(exit_group, int, error_code)
{
	do_group_exit((error_code & 0xff) << 8);
	/* NOTREACHED */
	return 0;
}

struct waitid_info {
	pid_t pid;
	uid_t uid;
	int status;
	int cause;
};

struct wait_opts {
	enum pid_type		wo_type;
	int			wo_flags;
	struct pid		*wo_pid;

	struct waitid_info	*wo_info;
	int			wo_stat;
	struct rusage		*wo_rusage;

	wait_queue_entry_t	child_wait;
	int			notask_error;
};

static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
	return	wo->wo_type == PIDTYPE_MAX ||
		task_pid_type(p, wo->wo_type) == wo->wo_pid;
}

static int
eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
{
	if (!eligible_pid(wo, p))
		return 0;

	/*
	 * Wait for all children (clone and not) if __WALL is set or
	 * if it is traced by us.
	 */
	if (ptrace || (wo->wo_flags & __WALL))
		return 1;

	/*
	 * Otherwise, wait for clone children *only* if __WCLONE is set;
	 * otherwise, wait for non-clone children *only*.
	 *
	 * Note: a "clone" child here is one that reports to its parent
	 * using a signal other than SIGCHLD, or a non-leader thread which
	 * we can only see if it is traced by us.
	 */
	if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
		return 0;

	return 1;
}

/*
 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
	int state, status;
	pid_t pid = task_pid_vnr(p);
	uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
	struct waitid_info *infop;

	if (!likely(wo->wo_flags & WEXITED))
		return 0;

	if (unlikely(wo->wo_flags & WNOWAIT)) {
		status = (p->signal->flags & SIGNAL_GROUP_EXIT)
			? p->signal->group_exit_code : p->exit_code;
		get_task_struct(p);
		read_unlock(&tasklist_lock);
		sched_annotate_sleep();
		if (wo->wo_rusage)
			getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
		put_task_struct(p);
		goto out_info;
	}
	/*
	 * Move the task's state to DEAD/TRACE, only one thread can do this.
	 */
	state = (ptrace_reparented(p) && thread_group_leader(p)) ?
		EXIT_TRACE : EXIT_DEAD;
	if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
		return 0;
	/*
	 * We own this thread, nobody else can reap it.
	 */
	read_unlock(&tasklist_lock);
	sched_annotate_sleep();

	/*
	 * Check thread_group_leader() to exclude the traced sub-threads.
	 */
	if (state == EXIT_DEAD && thread_group_leader(p)) {
		struct signal_struct *sig = p->signal;
		struct signal_struct *psig = current->signal;
		unsigned long maxrss;
		u64 tgutime, tgstime;

		/*
		 * The resource counters for the group leader are in its
		 * own task_struct.  Those for dead threads in the group
		 * are in its signal_struct, as are those for the child
		 * processes it has previously reaped.  All these
		 * accumulate in the parent's signal_struct c* fields.
		 *
		 * We don't bother to take a lock here to protect these
		 * p->signal fields because the whole thread group is dead
		 * and nobody can change them.
		 *
		 * psig->stats_lock also protects us from our sub-threads
		 * which can reap other children at the same time. Until
		 * we change k_getrusage()-like users to rely on this lock
		 * we have to take ->siglock as well.
		 *
		 * We use thread_group_cputime_adjusted() to get times for
		 * the thread group, which consolidates times for all threads
		 * in the group including the group leader.
		 */
		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
		spin_lock_irq(&current->sighand->siglock);
		write_seqlock(&psig->stats_lock);
		psig->cutime += tgutime + sig->cutime;
		psig->cstime += tgstime + sig->cstime;
		psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
		psig->cmin_flt +=
			p->min_flt + sig->min_flt + sig->cmin_flt;
		psig->cmaj_flt +=
			p->maj_flt + sig->maj_flt + sig->cmaj_flt;
		psig->cnvcsw +=
			p->nvcsw + sig->nvcsw + sig->cnvcsw;
		psig->cnivcsw +=
			p->nivcsw + sig->nivcsw + sig->cnivcsw;
		psig->cinblock +=
			task_io_get_inblock(p) +
			sig->inblock + sig->cinblock;
		psig->coublock +=
			task_io_get_oublock(p) +
			sig->oublock + sig->coublock;
		maxrss = max(sig->maxrss, sig->cmaxrss);
		if (psig->cmaxrss < maxrss)
			psig->cmaxrss = maxrss;
		task_io_accounting_add(&psig->ioac, &p->ioac);
		task_io_accounting_add(&psig->ioac, &sig->ioac);
		write_sequnlock(&psig->stats_lock);
		spin_unlock_irq(&current->sighand->siglock);
	}

	if (wo->wo_rusage)
		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
		? p->signal->group_exit_code : p->exit_code;
	wo->wo_stat = status;

	if (state == EXIT_TRACE) {
		write_lock_irq(&tasklist_lock);
		/* We dropped tasklist, ptracer could die and untrace */
		ptrace_unlink(p);

		/* If parent wants a zombie, don't release it now */
		state = EXIT_ZOMBIE;
		if (do_notify_parent(p, p->exit_signal))
			state = EXIT_DEAD;
		p->exit_state = state;
		write_unlock_irq(&tasklist_lock);
	}
	if (state == EXIT_DEAD)
		release_task(p);

out_info:
	infop = wo->wo_info;
	if (infop) {
		if ((status & 0x7f) == 0) {
			infop->cause = CLD_EXITED;
			infop->status = status >> 8;
		} else {
			infop->cause = (status & 0x80) ?
				CLD_DUMPED : CLD_KILLED;
			infop->status = status & 0x7f;
		}
		infop->pid = pid;
		infop->uid = uid;
	}

	return pid;
}

static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
	if (ptrace) {
		if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
			return &p->exit_code;
	} else {
		if (p->signal->flags & SIGNAL_STOP_STOPPED)
			return &p->signal->group_exit_code;
	}
	return NULL;
}

/**
 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
 * @wo: wait options
 * @ptrace: is the wait for ptrace
 * @p: task to wait for
 *
 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
 *
 * CONTEXT:
 * read_lock(&tasklist_lock), which is released if return value is
 * non-zero.  Also, grabs and releases @p->sighand->siglock.
 *
 * RETURNS:
 * 0 if wait condition didn't exist and search for other wait conditions
 * should continue.  Non-zero return, -errno on failure and @p's pid on
 * success, implies that tasklist_lock is released and wait condition
 * search should terminate.
 */
static int wait_task_stopped(struct wait_opts *wo,
				int ptrace, struct task_struct *p)
{
	struct waitid_info *infop;
	int exit_code, *p_code, why;
	uid_t uid = 0; /* unneeded, required by compiler */
	pid_t pid;

	/*
	 * Traditionally we see ptrace'd stopped tasks regardless of options.
	 */
	if (!ptrace && !(wo->wo_flags & WUNTRACED))
		return 0;

	if (!task_stopped_code(p, ptrace))
		return 0;

	exit_code = 0;
	spin_lock_irq(&p->sighand->siglock);

	p_code = task_stopped_code(p, ptrace);
	if (unlikely(!p_code))
		goto unlock_sig;

	exit_code = *p_code;
	if (!exit_code)
		goto unlock_sig;

	if (!unlikely(wo->wo_flags & WNOWAIT))
		*p_code = 0;

	uid = from_kuid_munged(current_user_ns(), task_uid(p));
unlock_sig:
	spin_unlock_irq(&p->sighand->siglock);
	if (!exit_code)
		return 0;

	/*
	 * Now we are pretty sure this task is interesting.
	 * Make sure it doesn't get reaped out from under us while we
	 * give up the lock and then examine it below.  We don't want to
	 * keep holding onto the tasklist_lock while we call getrusage and
	 * possibly take page faults for user memory.
	 */
	get_task_struct(p);
	pid = task_pid_vnr(p);
	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
	read_unlock(&tasklist_lock);
	sched_annotate_sleep();
	if (wo->wo_rusage)
		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
	put_task_struct(p);

	if (likely(!(wo->wo_flags & WNOWAIT)))
		wo->wo_stat = (exit_code << 8) | 0x7f;

	infop = wo->wo_info;
	if (infop) {
		infop->cause = why;
		infop->status = exit_code;
		infop->pid = pid;
		infop->uid = uid;
	}
	return pid;
}

/*
 * Handle do_wait work for one task in a live, non-stopped state.
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
	struct waitid_info *infop;
	pid_t pid;
	uid_t uid;

	if (!unlikely(wo->wo_flags & WCONTINUED))
		return 0;

	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
		return 0;

	spin_lock_irq(&p->sighand->siglock);
	/* Re-check with the lock held.  */
	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
		spin_unlock_irq(&p->sighand->siglock);
		return 0;
	}
	if (!unlikely(wo->wo_flags & WNOWAIT))
		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
	uid = from_kuid_munged(current_user_ns(), task_uid(p));
	spin_unlock_irq(&p->sighand->siglock);

	pid = task_pid_vnr(p);
	get_task_struct(p);
	read_unlock(&tasklist_lock);
	sched_annotate_sleep();
	if (wo->wo_rusage)
		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
	put_task_struct(p);

	infop = wo->wo_info;
	if (!infop) {
		wo->wo_stat = 0xffff;
	} else {
		infop->cause = CLD_CONTINUED;
		infop->pid = pid;
		infop->uid = uid;
		infop->status = SIGCONT;
	}
	return pid;
}

/*
 * Consider @p for a wait by @parent.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue;
 * then ->notask_error is 0 if @p is an eligible child,
 * or still -ECHILD.
 */
static int wait_consider_task(struct wait_opts *wo, int ptrace,
				struct task_struct *p)
{
	/*
	 * We can race with wait_task_zombie() from another thread.
	 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
	 * can't confuse the checks below.
	 */
	int exit_state = READ_ONCE(p->exit_state);
	int ret;

	if (unlikely(exit_state == EXIT_DEAD))
		return 0;

	ret = eligible_child(wo, ptrace, p);
	if (!ret)
		return ret;

	if (unlikely(exit_state == EXIT_TRACE)) {
		/*
		 * ptrace == 0 means we are the natural parent. In this case
		 * we should clear notask_error, debugger will notify us.
		 */
		if (likely(!ptrace))
			wo->notask_error = 0;
		return 0;
	}

	if (likely(!ptrace) && unlikely(p->ptrace)) {
		/*
		 * If it is traced by its real parent's group, just pretend
		 * the caller is ptrace_do_wait() and reap this child if it
		 * is zombie.
		 *
		 * This also hides group stop state from real parent; otherwise
		 * a single stop can be reported twice as group and ptrace stop.
		 * If a ptracer wants to distinguish these two events for its
		 * own children it should create a separate process which takes
		 * the role of real parent.
		 */
		if (!ptrace_reparented(p))
			ptrace = 1;
	}

	/* slay zombie? */
	if (exit_state == EXIT_ZOMBIE) {
		/* we don't reap group leaders with subthreads */
		if (!delay_group_leader(p)) {
			/*
			 * A zombie ptracee is only visible to its ptracer.
			 * Notification and reaping will be cascaded to the
			 * real parent when the ptracer detaches.
			 */
			if (unlikely(ptrace) || likely(!p->ptrace))
				return wait_task_zombie(wo, p);
		}

		/*
		 * Allow access to stopped/continued state via zombie by
		 * falling through.  Clearing of notask_error is complex.
		 *
		 * When !@ptrace:
		 *
		 * If WEXITED is set, notask_error should naturally be
		 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
		 * so, if there are live subthreads, there are events to
		 * wait for.  If all subthreads are dead, it's still safe
		 * to clear - this function will be called again in finite
		 * amount time once all the subthreads are released and
		 * will then return without clearing.
		 *
		 * When @ptrace:
		 *
		 * Stopped state is per-task and thus can't change once the
		 * target task dies.  Only continued and exited can happen.
		 * Clear notask_error if WCONTINUED | WEXITED.
		 */
		if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
			wo->notask_error = 0;
	} else {
		/*
		 * @p is alive and it's gonna stop, continue or exit, so
		 * there always is something to wait for.
		 */
		wo->notask_error = 0;
	}

	/*
	 * Wait for stopped.  Depending on @ptrace, different stopped state
	 * is used and the two don't interact with each other.
	 */
	ret = wait_task_stopped(wo, ptrace, p);
	if (ret)
		return ret;

	/*
	 * Wait for continued.  There's only one continued state and the
	 * ptracer can consume it which can confuse the real parent.  Don't
	 * use WCONTINUED from ptracer.  You don't need or want it.
	 */
	return wait_task_continued(wo, p);
}

/*
 * Do the work of do_wait() for one thread in the group, @tsk.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue; then
 * ->notask_error is 0 if there were any eligible children,
 * or still -ECHILD.
 */
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
	struct task_struct *p;

	list_for_each_entry(p, &tsk->children, sibling) {
		int ret = wait_consider_task(wo, 0, p);

		if (ret)
			return ret;
	}

	return 0;
}

static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
{
	struct task_struct *p;

	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
		int ret = wait_consider_task(wo, 1, p);

		if (ret)
			return ret;
	}

	return 0;
}

static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
				int sync, void *key)
{
	struct wait_opts *wo = container_of(wait, struct wait_opts,
						child_wait);
	struct task_struct *p = key;

	if (!eligible_pid(wo, p))
		return 0;

	if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
		return 0;

	return default_wake_function(wait, mode, sync, key);
}

void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
	__wake_up_sync_key(&parent->signal->wait_chldexit,
			   TASK_INTERRUPTIBLE, p);
}

static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
				 struct task_struct *target)
{
	struct task_struct *parent =
		!ptrace ? target->real_parent : target->parent;

	return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
				     same_thread_group(current, parent));
}

/*
 * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
 * and tracee lists to find the target task.
 */
static int do_wait_pid(struct wait_opts *wo)
{
	bool ptrace;
	struct task_struct *target;
	int retval;

	ptrace = false;
	target = pid_task(wo->wo_pid, PIDTYPE_TGID);
	if (target && is_effectively_child(wo, ptrace, target)) {
		retval = wait_consider_task(wo, ptrace, target);
		if (retval)
			return retval;
	}

	ptrace = true;
	target = pid_task(wo->wo_pid, PIDTYPE_PID);
	if (target && target->ptrace &&
	    is_effectively_child(wo, ptrace, target)) {
		retval = wait_consider_task(wo, ptrace, target);
		if (retval)
			return retval;
	}

	return 0;
}

static long do_wait(struct wait_opts *wo)
{
	int retval;

	trace_sched_process_wait(wo->wo_pid);

	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
	wo->child_wait.private = current;
	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
repeat:
	/*
	 * If there is nothing that can match our criteria, just get out.
	 * We will clear ->notask_error to zero if we see any child that
	 * might later match our criteria, even if we are not able to reap
	 * it yet.
	 */
	wo->notask_error = -ECHILD;
	if ((wo->wo_type < PIDTYPE_MAX) &&
	    (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
		goto notask;

	set_current_state(TASK_INTERRUPTIBLE);
	read_lock(&tasklist_lock);

	if (wo->wo_type == PIDTYPE_PID) {
		retval = do_wait_pid(wo);
		if (retval)
			goto end;
	} else {
		struct task_struct *tsk = current;

		do {
			retval = do_wait_thread(wo, tsk);
			if (retval)
				goto end;

			retval = ptrace_do_wait(wo, tsk);
			if (retval)
				goto end;

			if (wo->wo_flags & __WNOTHREAD)
				break;
		} while_each_thread(current, tsk);
	}
	read_unlock(&tasklist_lock);

notask:
	retval = wo->notask_error;
	if (!retval && !(wo->wo_flags & WNOHANG)) {
		retval = -ERESTARTSYS;
		if (!signal_pending(current)) {
			schedule();
			goto repeat;
		}
	}
end:
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
	return retval;
}

static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
			  int options, struct rusage *ru)
{
	struct wait_opts wo;
	struct pid *pid = NULL;
	enum pid_type type;
	long ret;
	unsigned int f_flags = 0;

	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
			__WNOTHREAD|__WCLONE|__WALL))
		return -EINVAL;
	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
		return -EINVAL;

	switch (which) {
	case P_ALL:
		type = PIDTYPE_MAX;
		break;
	case P_PID:
		type = PIDTYPE_PID;
		if (upid <= 0)
			return -EINVAL;

		pid = find_get_pid(upid);
		break;
	case P_PGID:
		type = PIDTYPE_PGID;
		if (upid < 0)
			return -EINVAL;

		if (upid)
			pid = find_get_pid(upid);
		else
			pid = get_task_pid(current, PIDTYPE_PGID);
		break;
	case P_PIDFD:
		type = PIDTYPE_PID;
		if (upid < 0)
			return -EINVAL;

		pid = pidfd_get_pid(upid, &f_flags);
		if (IS_ERR(pid))
			return PTR_ERR(pid);

		break;
	default:
		return -EINVAL;
	}

	wo.wo_type	= type;
	wo.wo_pid	= pid;
	wo.wo_flags	= options;
	wo.wo_info	= infop;
	wo.wo_rusage	= ru;
	if (f_flags & O_NONBLOCK)
		wo.wo_flags |= WNOHANG;

	ret = do_wait(&wo);
	if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
		ret = -EAGAIN;

	put_pid(pid);
	return ret;
}

SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
		infop, int, options, struct rusage __user *, ru)
{
	struct rusage r;
	struct waitid_info info = {.status = 0};
	long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
	int signo = 0;

	if (err > 0) {
		signo = SIGCHLD;
		err = 0;
		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
			return -EFAULT;
	}
	if (!infop)
		return err;

	if (!user_write_access_begin(infop, sizeof(*infop)))
		return -EFAULT;

	unsafe_put_user(signo, &infop->si_signo, Efault);
	unsafe_put_user(0, &infop->si_errno, Efault);
	unsafe_put_user(info.cause, &infop->si_code, Efault);
	unsafe_put_user(info.pid, &infop->si_pid, Efault);
	unsafe_put_user(info.uid, &infop->si_uid, Efault);
	unsafe_put_user(info.status, &infop->si_status, Efault);
	user_write_access_end();
	return err;
Efault:
	user_write_access_end();
	return -EFAULT;
}

long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
		  struct rusage *ru)
{
	struct wait_opts wo;
	struct pid *pid = NULL;
	enum pid_type type;
	long ret;

	if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
			__WNOTHREAD|__WCLONE|__WALL))
		return -EINVAL;

	/* -INT_MIN is not defined */
	if (upid == INT_MIN)
		return -ESRCH;

	if (upid == -1)
		type = PIDTYPE_MAX;
	else if (upid < 0) {
		type = PIDTYPE_PGID;
		pid = find_get_pid(-upid);
	} else if (upid == 0) {
		type = PIDTYPE_PGID;
		pid = get_task_pid(current, PIDTYPE_PGID);
	} else /* upid > 0 */ {
		type = PIDTYPE_PID;
		pid = find_get_pid(upid);
	}

	wo.wo_type	= type;
	wo.wo_pid	= pid;
	wo.wo_flags	= options | WEXITED;
	wo.wo_info	= NULL;
	wo.wo_stat	= 0;
	wo.wo_rusage	= ru;
	ret = do_wait(&wo);
	put_pid(pid);
	if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
		ret = -EFAULT;

	return ret;
}

int kernel_wait(pid_t pid, int *stat)
{
	struct wait_opts wo = {
		.wo_type	= PIDTYPE_PID,
		.wo_pid		= find_get_pid(pid),
		.wo_flags	= WEXITED,
	};
	int ret;

	ret = do_wait(&wo);
	if (ret > 0 && wo.wo_stat)
		*stat = wo.wo_stat;
	put_pid(wo.wo_pid);
	return ret;
}

SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
		int, options, struct rusage __user *, ru)
{
	struct rusage r;
	long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);

	if (err > 0) {
		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
			return -EFAULT;
	}
	return err;
}

#ifdef __ARCH_WANT_SYS_WAITPID

/*
 * sys_waitpid() remains for compatibility. waitpid() should be
 * implemented by calling sys_wait4() from libc.a.
 */
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
{
	return kernel_wait4(pid, stat_addr, options, NULL);
}

#endif

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(wait4,
	compat_pid_t, pid,
	compat_uint_t __user *, stat_addr,
	int, options,
	struct compat_rusage __user *, ru)
{
	struct rusage r;
	long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
	if (err > 0) {
		if (ru && put_compat_rusage(&r, ru))
			return -EFAULT;
	}
	return err;
}

COMPAT_SYSCALL_DEFINE5(waitid,
		int, which, compat_pid_t, pid,
		struct compat_siginfo __user *, infop, int, options,
		struct compat_rusage __user *, uru)
{
	struct rusage ru;
	struct waitid_info info = {.status = 0};
	long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
	int signo = 0;
	if (err > 0) {
		signo = SIGCHLD;
		err = 0;
		if (uru) {
			/* kernel_waitid() overwrites everything in ru */
			if (COMPAT_USE_64BIT_TIME)
				err = copy_to_user(uru, &ru, sizeof(ru));
			else
				err = put_compat_rusage(&ru, uru);
			if (err)
				return -EFAULT;
		}
	}

	if (!infop)
		return err;

	if (!user_write_access_begin(infop, sizeof(*infop)))
		return -EFAULT;

	unsafe_put_user(signo, &infop->si_signo, Efault);
	unsafe_put_user(0, &infop->si_errno, Efault);
	unsafe_put_user(info.cause, &infop->si_code, Efault);
	unsafe_put_user(info.pid, &infop->si_pid, Efault);
	unsafe_put_user(info.uid, &infop->si_uid, Efault);
	unsafe_put_user(info.status, &infop->si_status, Efault);
	user_write_access_end();
	return err;
Efault:
	user_write_access_end();
	return -EFAULT;
}
#endif

/**
 * thread_group_exited - check that a thread group has exited
 * @pid: tgid of thread group to be checked.
 *
 * Test if the thread group represented by tgid has exited (all
 * threads are zombies, dead or completely gone).
 *
 * Return: true if the thread group has exited. false otherwise.
 */
bool thread_group_exited(struct pid *pid)
{
	struct task_struct *task;
	bool exited;

	rcu_read_lock();
	task = pid_task(pid, PIDTYPE_PID);
	exited = !task ||
		(READ_ONCE(task->exit_state) && thread_group_empty(task));
	rcu_read_unlock();

	return exited;
}
EXPORT_SYMBOL(thread_group_exited);

__weak void abort(void)
{
	BUG();

	/* if that doesn't kill us, halt */
	panic("Oops failed to kill thread");
}
EXPORT_SYMBOL(abort);