1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * linux/kernel/exit.c 4 * 5 * Copyright (C) 1991, 1992 Linus Torvalds 6 */ 7 8 #include <linux/mm.h> 9 #include <linux/slab.h> 10 #include <linux/sched/autogroup.h> 11 #include <linux/sched/mm.h> 12 #include <linux/sched/stat.h> 13 #include <linux/sched/task.h> 14 #include <linux/sched/task_stack.h> 15 #include <linux/sched/cputime.h> 16 #include <linux/interrupt.h> 17 #include <linux/module.h> 18 #include <linux/capability.h> 19 #include <linux/completion.h> 20 #include <linux/personality.h> 21 #include <linux/tty.h> 22 #include <linux/iocontext.h> 23 #include <linux/key.h> 24 #include <linux/cpu.h> 25 #include <linux/acct.h> 26 #include <linux/tsacct_kern.h> 27 #include <linux/file.h> 28 #include <linux/fdtable.h> 29 #include <linux/freezer.h> 30 #include <linux/binfmts.h> 31 #include <linux/nsproxy.h> 32 #include <linux/pid_namespace.h> 33 #include <linux/ptrace.h> 34 #include <linux/profile.h> 35 #include <linux/mount.h> 36 #include <linux/proc_fs.h> 37 #include <linux/kthread.h> 38 #include <linux/mempolicy.h> 39 #include <linux/taskstats_kern.h> 40 #include <linux/delayacct.h> 41 #include <linux/cgroup.h> 42 #include <linux/syscalls.h> 43 #include <linux/signal.h> 44 #include <linux/posix-timers.h> 45 #include <linux/cn_proc.h> 46 #include <linux/mutex.h> 47 #include <linux/futex.h> 48 #include <linux/pipe_fs_i.h> 49 #include <linux/audit.h> /* for audit_free() */ 50 #include <linux/resource.h> 51 #include <linux/task_io_accounting_ops.h> 52 #include <linux/blkdev.h> 53 #include <linux/task_work.h> 54 #include <linux/fs_struct.h> 55 #include <linux/init_task.h> 56 #include <linux/perf_event.h> 57 #include <trace/events/sched.h> 58 #include <linux/hw_breakpoint.h> 59 #include <linux/oom.h> 60 #include <linux/writeback.h> 61 #include <linux/shm.h> 62 #include <linux/kcov.h> 63 #include <linux/kmsan.h> 64 #include <linux/random.h> 65 #include <linux/rcuwait.h> 66 #include <linux/compat.h> 67 #include 
<linux/io_uring.h> 68 #include <linux/kprobes.h> 69 #include <linux/rethook.h> 70 #include <linux/sysfs.h> 71 72 #include <linux/uaccess.h> 73 #include <asm/unistd.h> 74 #include <asm/mmu_context.h> 75 76 /* 77 * The default value should be high enough to not crash a system that randomly 78 * crashes its kernel from time to time, but low enough to at least not permit 79 * overflowing 32-bit refcounts or the ldsem writer count. 80 */ 81 static unsigned int oops_limit = 10000; 82 83 #ifdef CONFIG_SYSCTL 84 static struct ctl_table kern_exit_table[] = { 85 { 86 .procname = "oops_limit", 87 .data = &oops_limit, 88 .maxlen = sizeof(oops_limit), 89 .mode = 0644, 90 .proc_handler = proc_douintvec, 91 }, 92 { } 93 }; 94 95 static __init int kernel_exit_sysctls_init(void) 96 { 97 register_sysctl_init("kernel", kern_exit_table); 98 return 0; 99 } 100 late_initcall(kernel_exit_sysctls_init); 101 #endif 102 103 static atomic_t oops_count = ATOMIC_INIT(0); 104 105 #ifdef CONFIG_SYSFS 106 static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, 107 char *page) 108 { 109 return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); 110 } 111 112 static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); 113 114 static __init int kernel_exit_sysfs_init(void) 115 { 116 sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); 117 return 0; 118 } 119 late_initcall(kernel_exit_sysfs_init); 120 #endif 121 122 static void __unhash_process(struct task_struct *p, bool group_dead) 123 { 124 nr_threads--; 125 detach_pid(p, PIDTYPE_PID); 126 if (group_dead) { 127 detach_pid(p, PIDTYPE_TGID); 128 detach_pid(p, PIDTYPE_PGID); 129 detach_pid(p, PIDTYPE_SID); 130 131 list_del_rcu(&p->tasks); 132 list_del_init(&p->sibling); 133 __this_cpu_dec(process_counts); 134 } 135 list_del_rcu(&p->thread_group); 136 list_del_rcu(&p->thread_node); 137 } 138 139 /* 140 * This function expects the tasklist_lock write-locked. 
141 */ 142 static void __exit_signal(struct task_struct *tsk) 143 { 144 struct signal_struct *sig = tsk->signal; 145 bool group_dead = thread_group_leader(tsk); 146 struct sighand_struct *sighand; 147 struct tty_struct *tty; 148 u64 utime, stime; 149 150 sighand = rcu_dereference_check(tsk->sighand, 151 lockdep_tasklist_lock_is_held()); 152 spin_lock(&sighand->siglock); 153 154 #ifdef CONFIG_POSIX_TIMERS 155 posix_cpu_timers_exit(tsk); 156 if (group_dead) 157 posix_cpu_timers_exit_group(tsk); 158 #endif 159 160 if (group_dead) { 161 tty = sig->tty; 162 sig->tty = NULL; 163 } else { 164 /* 165 * If there is any task waiting for the group exit 166 * then notify it: 167 */ 168 if (sig->notify_count > 0 && !--sig->notify_count) 169 wake_up_process(sig->group_exec_task); 170 171 if (tsk == sig->curr_target) 172 sig->curr_target = next_thread(tsk); 173 } 174 175 add_device_randomness((const void*) &tsk->se.sum_exec_runtime, 176 sizeof(unsigned long long)); 177 178 /* 179 * Accumulate here the counters for all threads as they die. We could 180 * skip the group leader because it is the last user of signal_struct, 181 * but we want to avoid the race with thread_group_cputime() which can 182 * see the empty ->thread_head list. 183 */ 184 task_cputime(tsk, &utime, &stime); 185 write_seqlock(&sig->stats_lock); 186 sig->utime += utime; 187 sig->stime += stime; 188 sig->gtime += task_gtime(tsk); 189 sig->min_flt += tsk->min_flt; 190 sig->maj_flt += tsk->maj_flt; 191 sig->nvcsw += tsk->nvcsw; 192 sig->nivcsw += tsk->nivcsw; 193 sig->inblock += task_io_get_inblock(tsk); 194 sig->oublock += task_io_get_oublock(tsk); 195 task_io_accounting_add(&sig->ioac, &tsk->ioac); 196 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 197 sig->nr_threads--; 198 __unhash_process(tsk, group_dead); 199 write_sequnlock(&sig->stats_lock); 200 201 /* 202 * Do this under ->siglock, we can race with another thread 203 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. 
204 */ 205 flush_sigqueue(&tsk->pending); 206 tsk->sighand = NULL; 207 spin_unlock(&sighand->siglock); 208 209 __cleanup_sighand(sighand); 210 clear_tsk_thread_flag(tsk, TIF_SIGPENDING); 211 if (group_dead) { 212 flush_sigqueue(&sig->shared_pending); 213 tty_kref_put(tty); 214 } 215 } 216 217 static void delayed_put_task_struct(struct rcu_head *rhp) 218 { 219 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 220 221 kprobe_flush_task(tsk); 222 rethook_flush_task(tsk); 223 perf_event_delayed_put(tsk); 224 trace_sched_process_free(tsk); 225 put_task_struct(tsk); 226 } 227 228 void put_task_struct_rcu_user(struct task_struct *task) 229 { 230 if (refcount_dec_and_test(&task->rcu_users)) 231 call_rcu(&task->rcu, delayed_put_task_struct); 232 } 233 234 void __weak release_thread(struct task_struct *dead_task) 235 { 236 } 237 238 void release_task(struct task_struct *p) 239 { 240 struct task_struct *leader; 241 struct pid *thread_pid; 242 int zap_leader; 243 repeat: 244 /* don't need to get the RCU readlock here - the process is dead and 245 * can't be modifying its own credentials. But shut RCU-lockdep up */ 246 rcu_read_lock(); 247 dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); 248 rcu_read_unlock(); 249 250 cgroup_release(p); 251 252 write_lock_irq(&tasklist_lock); 253 ptrace_release_task(p); 254 thread_pid = get_pid(p->thread_pid); 255 __exit_signal(p); 256 257 /* 258 * If we are the last non-leader member of the thread 259 * group, and the leader is zombie, then notify the 260 * group leader's parent process. (if it wants notification.) 261 */ 262 zap_leader = 0; 263 leader = p->group_leader; 264 if (leader != p && thread_group_empty(leader) 265 && leader->exit_state == EXIT_ZOMBIE) { 266 /* 267 * If we were the last child thread and the leader has 268 * exited already, and the leader's parent ignores SIGCHLD, 269 * then we are the one who should release the leader. 
270 */ 271 zap_leader = do_notify_parent(leader, leader->exit_signal); 272 if (zap_leader) 273 leader->exit_state = EXIT_DEAD; 274 } 275 276 write_unlock_irq(&tasklist_lock); 277 seccomp_filter_release(p); 278 proc_flush_pid(thread_pid); 279 put_pid(thread_pid); 280 release_thread(p); 281 put_task_struct_rcu_user(p); 282 283 p = leader; 284 if (unlikely(zap_leader)) 285 goto repeat; 286 } 287 288 int rcuwait_wake_up(struct rcuwait *w) 289 { 290 int ret = 0; 291 struct task_struct *task; 292 293 rcu_read_lock(); 294 295 /* 296 * Order condition vs @task, such that everything prior to the load 297 * of @task is visible. This is the condition as to why the user called 298 * rcuwait_wake() in the first place. Pairs with set_current_state() 299 * barrier (A) in rcuwait_wait_event(). 300 * 301 * WAIT WAKE 302 * [S] tsk = current [S] cond = true 303 * MB (A) MB (B) 304 * [L] cond [L] tsk 305 */ 306 smp_mb(); /* (B) */ 307 308 task = rcu_dereference(w->task); 309 if (task) 310 ret = wake_up_process(task); 311 rcu_read_unlock(); 312 313 return ret; 314 } 315 EXPORT_SYMBOL_GPL(rcuwait_wake_up); 316 317 /* 318 * Determine if a process group is "orphaned", according to the POSIX 319 * definition in 2.2.2.52. Orphaned process groups are not to be affected 320 * by terminal-generated stop signals. Newly orphaned process groups are 321 * to receive a SIGHUP and a SIGCONT. 322 * 323 * "I ask you, have you ever known what it is to be an orphan?" 
324 */ 325 static int will_become_orphaned_pgrp(struct pid *pgrp, 326 struct task_struct *ignored_task) 327 { 328 struct task_struct *p; 329 330 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 331 if ((p == ignored_task) || 332 (p->exit_state && thread_group_empty(p)) || 333 is_global_init(p->real_parent)) 334 continue; 335 336 if (task_pgrp(p->real_parent) != pgrp && 337 task_session(p->real_parent) == task_session(p)) 338 return 0; 339 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 340 341 return 1; 342 } 343 344 int is_current_pgrp_orphaned(void) 345 { 346 int retval; 347 348 read_lock(&tasklist_lock); 349 retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); 350 read_unlock(&tasklist_lock); 351 352 return retval; 353 } 354 355 static bool has_stopped_jobs(struct pid *pgrp) 356 { 357 struct task_struct *p; 358 359 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 360 if (p->signal->flags & SIGNAL_STOP_STOPPED) 361 return true; 362 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 363 364 return false; 365 } 366 367 /* 368 * Check to see if any process groups have become orphaned as 369 * a result of our exiting, and if they have any stopped jobs, 370 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 371 */ 372 static void 373 kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) 374 { 375 struct pid *pgrp = task_pgrp(tsk); 376 struct task_struct *ignored_task = tsk; 377 378 if (!parent) 379 /* exit: our father is in a different pgrp than 380 * we are and we were the only connection outside. 381 */ 382 parent = tsk->real_parent; 383 else 384 /* reparent: our child is in a different pgrp than 385 * we are, and it was the only connection outside. 
386 */ 387 ignored_task = NULL; 388 389 if (task_pgrp(parent) != pgrp && 390 task_session(parent) == task_session(tsk) && 391 will_become_orphaned_pgrp(pgrp, ignored_task) && 392 has_stopped_jobs(pgrp)) { 393 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); 394 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); 395 } 396 } 397 398 static void coredump_task_exit(struct task_struct *tsk) 399 { 400 struct core_state *core_state; 401 402 /* 403 * Serialize with any possible pending coredump. 404 * We must hold siglock around checking core_state 405 * and setting PF_POSTCOREDUMP. The core-inducing thread 406 * will increment ->nr_threads for each thread in the 407 * group without PF_POSTCOREDUMP set. 408 */ 409 spin_lock_irq(&tsk->sighand->siglock); 410 tsk->flags |= PF_POSTCOREDUMP; 411 core_state = tsk->signal->core_state; 412 spin_unlock_irq(&tsk->sighand->siglock); 413 if (core_state) { 414 struct core_thread self; 415 416 self.task = current; 417 if (self.task->flags & PF_SIGNALED) 418 self.next = xchg(&core_state->dumper.next, &self); 419 else 420 self.task = NULL; 421 /* 422 * Implies mb(), the result of xchg() must be visible 423 * to core_state->dumper. 424 */ 425 if (atomic_dec_and_test(&core_state->nr_threads)) 426 complete(&core_state->startup); 427 428 for (;;) { 429 set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); 430 if (!self.task) /* see coredump_finish() */ 431 break; 432 schedule(); 433 } 434 __set_current_state(TASK_RUNNING); 435 } 436 } 437 438 #ifdef CONFIG_MEMCG 439 /* 440 * A task is exiting. If it owned this mm, find a new owner for the mm. 441 */ 442 void mm_update_next_owner(struct mm_struct *mm) 443 { 444 struct task_struct *c, *g, *p = current; 445 446 retry: 447 /* 448 * If the exiting or execing task is not the owner, it's 449 * someone else's problem. 450 */ 451 if (mm->owner != p) 452 return; 453 /* 454 * The current owner is exiting/execing and there are no other 455 * candidates. 
Do not leave the mm pointing to a possibly 456 * freed task structure. 457 */ 458 if (atomic_read(&mm->mm_users) <= 1) { 459 WRITE_ONCE(mm->owner, NULL); 460 return; 461 } 462 463 read_lock(&tasklist_lock); 464 /* 465 * Search in the children 466 */ 467 list_for_each_entry(c, &p->children, sibling) { 468 if (c->mm == mm) 469 goto assign_new_owner; 470 } 471 472 /* 473 * Search in the siblings 474 */ 475 list_for_each_entry(c, &p->real_parent->children, sibling) { 476 if (c->mm == mm) 477 goto assign_new_owner; 478 } 479 480 /* 481 * Search through everything else, we should not get here often. 482 */ 483 for_each_process(g) { 484 if (g->flags & PF_KTHREAD) 485 continue; 486 for_each_thread(g, c) { 487 if (c->mm == mm) 488 goto assign_new_owner; 489 if (c->mm) 490 break; 491 } 492 } 493 read_unlock(&tasklist_lock); 494 /* 495 * We found no owner yet mm_users > 1: this implies that we are 496 * most likely racing with swapoff (try_to_unuse()) or /proc or 497 * ptrace or page migration (get_task_mm()). Mark owner as NULL. 498 */ 499 WRITE_ONCE(mm->owner, NULL); 500 return; 501 502 assign_new_owner: 503 BUG_ON(c == p); 504 get_task_struct(c); 505 /* 506 * The task_lock protects c->mm from changing. 507 * We always want mm->owner->mm == mm 508 */ 509 task_lock(c); 510 /* 511 * Delay read_unlock() till we have the task_lock() 512 * to ensure that c does not slip away underneath us 513 */ 514 read_unlock(&tasklist_lock); 515 if (c->mm != mm) { 516 task_unlock(c); 517 put_task_struct(c); 518 goto retry; 519 } 520 WRITE_ONCE(mm->owner, c); 521 lru_gen_migrate_mm(mm); 522 task_unlock(c); 523 put_task_struct(c); 524 } 525 #endif /* CONFIG_MEMCG */ 526 527 /* 528 * Turn us into a lazy TLB process if we 529 * aren't already.. 
530 */ 531 static void exit_mm(void) 532 { 533 struct mm_struct *mm = current->mm; 534 535 exit_mm_release(current, mm); 536 if (!mm) 537 return; 538 sync_mm_rss(mm); 539 mmap_read_lock(mm); 540 mmgrab(mm); 541 BUG_ON(mm != current->active_mm); 542 /* more a memory barrier than a real lock */ 543 task_lock(current); 544 /* 545 * When a thread stops operating on an address space, the loop 546 * in membarrier_private_expedited() may not observe that 547 * tsk->mm, and the loop in membarrier_global_expedited() may 548 * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED 549 * rq->membarrier_state, so those would not issue an IPI. 550 * Membarrier requires a memory barrier after accessing 551 * user-space memory, before clearing tsk->mm or the 552 * rq->membarrier_state. 553 */ 554 smp_mb__after_spinlock(); 555 local_irq_disable(); 556 current->mm = NULL; 557 membarrier_update_current_mm(NULL); 558 enter_lazy_tlb(mm, current); 559 local_irq_enable(); 560 task_unlock(current); 561 mmap_read_unlock(mm); 562 mm_update_next_owner(mm); 563 mmput(mm); 564 if (test_thread_flag(TIF_MEMDIE)) 565 exit_oom_victim(); 566 } 567 568 static struct task_struct *find_alive_thread(struct task_struct *p) 569 { 570 struct task_struct *t; 571 572 for_each_thread(p, t) { 573 if (!(t->flags & PF_EXITING)) 574 return t; 575 } 576 return NULL; 577 } 578 579 static struct task_struct *find_child_reaper(struct task_struct *father, 580 struct list_head *dead) 581 __releases(&tasklist_lock) 582 __acquires(&tasklist_lock) 583 { 584 struct pid_namespace *pid_ns = task_active_pid_ns(father); 585 struct task_struct *reaper = pid_ns->child_reaper; 586 struct task_struct *p, *n; 587 588 if (likely(reaper != father)) 589 return reaper; 590 591 reaper = find_alive_thread(father); 592 if (reaper) { 593 pid_ns->child_reaper = reaper; 594 return reaper; 595 } 596 597 write_unlock_irq(&tasklist_lock); 598 599 list_for_each_entry_safe(p, n, dead, ptrace_entry) { 600 list_del_init(&p->ptrace_entry); 601 
release_task(p); 602 } 603 604 zap_pid_ns_processes(pid_ns); 605 write_lock_irq(&tasklist_lock); 606 607 return father; 608 } 609 610 /* 611 * When we die, we re-parent all our children, and try to: 612 * 1. give them to another thread in our thread group, if such a member exists 613 * 2. give it to the first ancestor process which prctl'd itself as a 614 * child_subreaper for its children (like a service manager) 615 * 3. give it to the init process (PID 1) in our pid namespace 616 */ 617 static struct task_struct *find_new_reaper(struct task_struct *father, 618 struct task_struct *child_reaper) 619 { 620 struct task_struct *thread, *reaper; 621 622 thread = find_alive_thread(father); 623 if (thread) 624 return thread; 625 626 if (father->signal->has_child_subreaper) { 627 unsigned int ns_level = task_pid(father)->level; 628 /* 629 * Find the first ->is_child_subreaper ancestor in our pid_ns. 630 * We can't check reaper != child_reaper to ensure we do not 631 * cross the namespaces, the exiting parent could be injected 632 * by setns() + fork(). 633 * We check pid->level, this is slightly more efficient than 634 * task_active_pid_ns(reaper) != task_active_pid_ns(father). 635 */ 636 for (reaper = father->real_parent; 637 task_pid(reaper)->level == ns_level; 638 reaper = reaper->real_parent) { 639 if (reaper == &init_task) 640 break; 641 if (!reaper->signal->is_child_subreaper) 642 continue; 643 thread = find_alive_thread(reaper); 644 if (thread) 645 return thread; 646 } 647 } 648 649 return child_reaper; 650 } 651 652 /* 653 * Any that need to be release_task'd are put on the @dead list. 654 */ 655 static void reparent_leader(struct task_struct *father, struct task_struct *p, 656 struct list_head *dead) 657 { 658 if (unlikely(p->exit_state == EXIT_DEAD)) 659 return; 660 661 /* We don't want people slaying init. */ 662 p->exit_signal = SIGCHLD; 663 664 /* If it has exited notify the new parent about this child's death. 
*/ 665 if (!p->ptrace && 666 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 667 if (do_notify_parent(p, p->exit_signal)) { 668 p->exit_state = EXIT_DEAD; 669 list_add(&p->ptrace_entry, dead); 670 } 671 } 672 673 kill_orphaned_pgrp(p, father); 674 } 675 676 /* 677 * This does two things: 678 * 679 * A. Make init inherit all the child processes 680 * B. Check to see if any process groups have become orphaned 681 * as a result of our exiting, and if they have any stopped 682 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 683 */ 684 static void forget_original_parent(struct task_struct *father, 685 struct list_head *dead) 686 { 687 struct task_struct *p, *t, *reaper; 688 689 if (unlikely(!list_empty(&father->ptraced))) 690 exit_ptrace(father, dead); 691 692 /* Can drop and reacquire tasklist_lock */ 693 reaper = find_child_reaper(father, dead); 694 if (list_empty(&father->children)) 695 return; 696 697 reaper = find_new_reaper(father, reaper); 698 list_for_each_entry(p, &father->children, sibling) { 699 for_each_thread(p, t) { 700 RCU_INIT_POINTER(t->real_parent, reaper); 701 BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); 702 if (likely(!t->ptrace)) 703 t->parent = t->real_parent; 704 if (t->pdeath_signal) 705 group_send_sig_info(t->pdeath_signal, 706 SEND_SIG_NOINFO, t, 707 PIDTYPE_TGID); 708 } 709 /* 710 * If this is a threaded reparent there is no need to 711 * notify anyone anything has happened. 712 */ 713 if (!same_thread_group(reaper, father)) 714 reparent_leader(father, p, dead); 715 } 716 list_splice_tail_init(&father->children, &reaper->children); 717 } 718 719 /* 720 * Send signals to all our closest relatives so that they know 721 * to properly mourn us.. 
722 */ 723 static void exit_notify(struct task_struct *tsk, int group_dead) 724 { 725 bool autoreap; 726 struct task_struct *p, *n; 727 LIST_HEAD(dead); 728 729 write_lock_irq(&tasklist_lock); 730 forget_original_parent(tsk, &dead); 731 732 if (group_dead) 733 kill_orphaned_pgrp(tsk->group_leader, NULL); 734 735 tsk->exit_state = EXIT_ZOMBIE; 736 if (unlikely(tsk->ptrace)) { 737 int sig = thread_group_leader(tsk) && 738 thread_group_empty(tsk) && 739 !ptrace_reparented(tsk) ? 740 tsk->exit_signal : SIGCHLD; 741 autoreap = do_notify_parent(tsk, sig); 742 } else if (thread_group_leader(tsk)) { 743 autoreap = thread_group_empty(tsk) && 744 do_notify_parent(tsk, tsk->exit_signal); 745 } else { 746 autoreap = true; 747 } 748 749 if (autoreap) { 750 tsk->exit_state = EXIT_DEAD; 751 list_add(&tsk->ptrace_entry, &dead); 752 } 753 754 /* mt-exec, de_thread() is waiting for group leader */ 755 if (unlikely(tsk->signal->notify_count < 0)) 756 wake_up_process(tsk->signal->group_exec_task); 757 write_unlock_irq(&tasklist_lock); 758 759 list_for_each_entry_safe(p, n, &dead, ptrace_entry) { 760 list_del_init(&p->ptrace_entry); 761 release_task(p); 762 } 763 } 764 765 #ifdef CONFIG_DEBUG_STACK_USAGE 766 static void check_stack_usage(void) 767 { 768 static DEFINE_SPINLOCK(low_water_lock); 769 static int lowest_to_date = THREAD_SIZE; 770 unsigned long free; 771 772 free = stack_not_used(current); 773 774 if (free >= lowest_to_date) 775 return; 776 777 spin_lock(&low_water_lock); 778 if (free < lowest_to_date) { 779 pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", 780 current->comm, task_pid_nr(current), free); 781 lowest_to_date = free; 782 } 783 spin_unlock(&low_water_lock); 784 } 785 #else 786 static inline void check_stack_usage(void) {} 787 #endif 788 789 static void synchronize_group_exit(struct task_struct *tsk, long code) 790 { 791 struct sighand_struct *sighand = tsk->sighand; 792 struct signal_struct *signal = tsk->signal; 793 794 
spin_lock_irq(&sighand->siglock); 795 signal->quick_threads--; 796 if ((signal->quick_threads == 0) && 797 !(signal->flags & SIGNAL_GROUP_EXIT)) { 798 signal->flags = SIGNAL_GROUP_EXIT; 799 signal->group_exit_code = code; 800 signal->group_stop_count = 0; 801 } 802 spin_unlock_irq(&sighand->siglock); 803 } 804 805 void __noreturn do_exit(long code) 806 { 807 struct task_struct *tsk = current; 808 int group_dead; 809 810 synchronize_group_exit(tsk, code); 811 812 WARN_ON(tsk->plug); 813 814 kcov_task_exit(tsk); 815 kmsan_task_exit(tsk); 816 817 coredump_task_exit(tsk); 818 ptrace_event(PTRACE_EVENT_EXIT, code); 819 820 validate_creds_for_do_exit(tsk); 821 822 io_uring_files_cancel(); 823 exit_signals(tsk); /* sets PF_EXITING */ 824 825 /* sync mm's RSS info before statistics gathering */ 826 if (tsk->mm) 827 sync_mm_rss(tsk->mm); 828 acct_update_integrals(tsk); 829 group_dead = atomic_dec_and_test(&tsk->signal->live); 830 if (group_dead) { 831 /* 832 * If the last thread of global init has exited, panic 833 * immediately to get a useable coredump. 834 */ 835 if (unlikely(is_global_init(tsk))) 836 panic("Attempted to kill init! 
exitcode=0x%08x\n", 837 tsk->signal->group_exit_code ?: (int)code); 838 839 #ifdef CONFIG_POSIX_TIMERS 840 hrtimer_cancel(&tsk->signal->real_timer); 841 exit_itimers(tsk); 842 #endif 843 if (tsk->mm) 844 setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); 845 } 846 acct_collect(code, group_dead); 847 if (group_dead) 848 tty_audit_exit(); 849 audit_free(tsk); 850 851 tsk->exit_code = code; 852 taskstats_exit(tsk, group_dead); 853 854 exit_mm(); 855 856 if (group_dead) 857 acct_process(); 858 trace_sched_process_exit(tsk); 859 860 exit_sem(tsk); 861 exit_shm(tsk); 862 exit_files(tsk); 863 exit_fs(tsk); 864 if (group_dead) 865 disassociate_ctty(1); 866 exit_task_namespaces(tsk); 867 exit_task_work(tsk); 868 exit_thread(tsk); 869 870 /* 871 * Flush inherited counters to the parent - before the parent 872 * gets woken up by child-exit notifications. 873 * 874 * because of cgroup mode, must be called before cgroup_exit() 875 */ 876 perf_event_exit_task(tsk); 877 878 sched_autogroup_exit_task(tsk); 879 cgroup_exit(tsk); 880 881 /* 882 * FIXME: do that only when needed, using sched_exit tracepoint 883 */ 884 flush_ptrace_hw_breakpoint(tsk); 885 886 exit_tasks_rcu_start(); 887 exit_notify(tsk, group_dead); 888 proc_exit_connector(tsk); 889 mpol_put_task_policy(tsk); 890 #ifdef CONFIG_FUTEX 891 if (unlikely(current->pi_state_cache)) 892 kfree(current->pi_state_cache); 893 #endif 894 /* 895 * Make sure we are holding no locks: 896 */ 897 debug_check_no_locks_held(); 898 899 if (tsk->io_context) 900 exit_io_context(tsk); 901 902 if (tsk->splice_pipe) 903 free_pipe_info(tsk->splice_pipe); 904 905 if (tsk->task_frag.page) 906 put_page(tsk->task_frag.page); 907 908 validate_creds_for_do_exit(tsk); 909 exit_task_stack_account(tsk); 910 911 check_stack_usage(); 912 preempt_disable(); 913 if (tsk->nr_dirtied) 914 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); 915 exit_rcu(); 916 exit_tasks_rcu_finish(); 917 918 lockdep_free_task(tsk); 919 do_task_dead(); 920 } 921 922 void 
__noreturn make_task_dead(int signr) 923 { 924 /* 925 * Take the task off the cpu after something catastrophic has 926 * happened. 927 * 928 * We can get here from a kernel oops, sometimes with preemption off. 929 * Start by checking for critical errors. 930 * Then fix up important state like USER_DS and preemption. 931 * Then do everything else. 932 */ 933 struct task_struct *tsk = current; 934 unsigned int limit; 935 936 if (unlikely(in_interrupt())) 937 panic("Aiee, killing interrupt handler!"); 938 if (unlikely(!tsk->pid)) 939 panic("Attempted to kill the idle task!"); 940 941 if (unlikely(in_atomic())) { 942 pr_info("note: %s[%d] exited with preempt_count %d\n", 943 current->comm, task_pid_nr(current), 944 preempt_count()); 945 preempt_count_set(PREEMPT_ENABLED); 946 } 947 948 /* 949 * Every time the system oopses, if the oops happens while a reference 950 * to an object was held, the reference leaks. 951 * If the oops doesn't also leak memory, repeated oopsing can cause 952 * reference counters to wrap around (if they're not using refcount_t). 953 * This means that repeated oopsing can make unexploitable-looking bugs 954 * exploitable through repeated oopsing. 955 * To make sure this can't happen, place an upper bound on how often the 956 * kernel may oops without panic(). 957 */ 958 limit = READ_ONCE(oops_limit); 959 if (atomic_inc_return(&oops_count) >= limit && limit) 960 panic("Oopsed too often (kernel.oops_limit is %d)", limit); 961 962 /* 963 * We're taking recursive faults here in make_task_dead. Safest is to just 964 * leave this task alone and wait for reboot. 
965 */ 966 if (unlikely(tsk->flags & PF_EXITING)) { 967 pr_alert("Fixing recursive fault but reboot is needed!\n"); 968 futex_exit_recursive(tsk); 969 tsk->exit_state = EXIT_DEAD; 970 refcount_inc(&tsk->rcu_users); 971 do_task_dead(); 972 } 973 974 do_exit(signr); 975 } 976 977 SYSCALL_DEFINE1(exit, int, error_code) 978 { 979 do_exit((error_code&0xff)<<8); 980 } 981 982 /* 983 * Take down every thread in the group. This is called by fatal signals 984 * as well as by sys_exit_group (below). 985 */ 986 void __noreturn 987 do_group_exit(int exit_code) 988 { 989 struct signal_struct *sig = current->signal; 990 991 if (sig->flags & SIGNAL_GROUP_EXIT) 992 exit_code = sig->group_exit_code; 993 else if (sig->group_exec_task) 994 exit_code = 0; 995 else { 996 struct sighand_struct *const sighand = current->sighand; 997 998 spin_lock_irq(&sighand->siglock); 999 if (sig->flags & SIGNAL_GROUP_EXIT) 1000 /* Another thread got here before we took the lock. */ 1001 exit_code = sig->group_exit_code; 1002 else if (sig->group_exec_task) 1003 exit_code = 0; 1004 else { 1005 sig->group_exit_code = exit_code; 1006 sig->flags = SIGNAL_GROUP_EXIT; 1007 zap_other_threads(current); 1008 } 1009 spin_unlock_irq(&sighand->siglock); 1010 } 1011 1012 do_exit(exit_code); 1013 /* NOTREACHED */ 1014 } 1015 1016 /* 1017 * this kills every thread in the thread group. Note that any externally 1018 * wait4()-ing process will get the correct exit code - even if this 1019 * thread is not the thread group leader. 
1020 */ 1021 SYSCALL_DEFINE1(exit_group, int, error_code) 1022 { 1023 do_group_exit((error_code & 0xff) << 8); 1024 /* NOTREACHED */ 1025 return 0; 1026 } 1027 1028 struct waitid_info { 1029 pid_t pid; 1030 uid_t uid; 1031 int status; 1032 int cause; 1033 }; 1034 1035 struct wait_opts { 1036 enum pid_type wo_type; 1037 int wo_flags; 1038 struct pid *wo_pid; 1039 1040 struct waitid_info *wo_info; 1041 int wo_stat; 1042 struct rusage *wo_rusage; 1043 1044 wait_queue_entry_t child_wait; 1045 int notask_error; 1046 }; 1047 1048 static int eligible_pid(struct wait_opts *wo, struct task_struct *p) 1049 { 1050 return wo->wo_type == PIDTYPE_MAX || 1051 task_pid_type(p, wo->wo_type) == wo->wo_pid; 1052 } 1053 1054 static int 1055 eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) 1056 { 1057 if (!eligible_pid(wo, p)) 1058 return 0; 1059 1060 /* 1061 * Wait for all children (clone and not) if __WALL is set or 1062 * if it is traced by us. 1063 */ 1064 if (ptrace || (wo->wo_flags & __WALL)) 1065 return 1; 1066 1067 /* 1068 * Otherwise, wait for clone children *only* if __WCLONE is set; 1069 * otherwise, wait for non-clone children *only*. 1070 * 1071 * Note: a "clone" child here is one that reports to its parent 1072 * using a signal other than SIGCHLD, or a non-leader thread which 1073 * we can only see if it is traced by us. 1074 */ 1075 if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) 1076 return 0; 1077 1078 return 1; 1079 } 1080 1081 /* 1082 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold 1083 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1084 * the lock and this task is uninteresting. If we return nonzero, we have 1085 * released the lock and the system call should return. 
1086 */ 1087 static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) 1088 { 1089 int state, status; 1090 pid_t pid = task_pid_vnr(p); 1091 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); 1092 struct waitid_info *infop; 1093 1094 if (!likely(wo->wo_flags & WEXITED)) 1095 return 0; 1096 1097 if (unlikely(wo->wo_flags & WNOWAIT)) { 1098 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1099 ? p->signal->group_exit_code : p->exit_code; 1100 get_task_struct(p); 1101 read_unlock(&tasklist_lock); 1102 sched_annotate_sleep(); 1103 if (wo->wo_rusage) 1104 getrusage(p, RUSAGE_BOTH, wo->wo_rusage); 1105 put_task_struct(p); 1106 goto out_info; 1107 } 1108 /* 1109 * Move the task's state to DEAD/TRACE, only one thread can do this. 1110 */ 1111 state = (ptrace_reparented(p) && thread_group_leader(p)) ? 1112 EXIT_TRACE : EXIT_DEAD; 1113 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) 1114 return 0; 1115 /* 1116 * We own this thread, nobody else can reap it. 1117 */ 1118 read_unlock(&tasklist_lock); 1119 sched_annotate_sleep(); 1120 1121 /* 1122 * Check thread_group_leader() to exclude the traced sub-threads. 1123 */ 1124 if (state == EXIT_DEAD && thread_group_leader(p)) { 1125 struct signal_struct *sig = p->signal; 1126 struct signal_struct *psig = current->signal; 1127 unsigned long maxrss; 1128 u64 tgutime, tgstime; 1129 1130 /* 1131 * The resource counters for the group leader are in its 1132 * own task_struct. Those for dead threads in the group 1133 * are in its signal_struct, as are those for the child 1134 * processes it has previously reaped. All these 1135 * accumulate in the parent's signal_struct c* fields. 1136 * 1137 * We don't bother to take a lock here to protect these 1138 * p->signal fields because the whole thread group is dead 1139 * and nobody can change them. 1140 * 1141 * psig->stats_lock also protects us from our sub-threads 1142 * which can reap other children at the same time. 
Until 1143 * we change k_getrusage()-like users to rely on this lock 1144 * we have to take ->siglock as well. 1145 * 1146 * We use thread_group_cputime_adjusted() to get times for 1147 * the thread group, which consolidates times for all threads 1148 * in the group including the group leader. 1149 */ 1150 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1151 spin_lock_irq(¤t->sighand->siglock); 1152 write_seqlock(&psig->stats_lock); 1153 psig->cutime += tgutime + sig->cutime; 1154 psig->cstime += tgstime + sig->cstime; 1155 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; 1156 psig->cmin_flt += 1157 p->min_flt + sig->min_flt + sig->cmin_flt; 1158 psig->cmaj_flt += 1159 p->maj_flt + sig->maj_flt + sig->cmaj_flt; 1160 psig->cnvcsw += 1161 p->nvcsw + sig->nvcsw + sig->cnvcsw; 1162 psig->cnivcsw += 1163 p->nivcsw + sig->nivcsw + sig->cnivcsw; 1164 psig->cinblock += 1165 task_io_get_inblock(p) + 1166 sig->inblock + sig->cinblock; 1167 psig->coublock += 1168 task_io_get_oublock(p) + 1169 sig->oublock + sig->coublock; 1170 maxrss = max(sig->maxrss, sig->cmaxrss); 1171 if (psig->cmaxrss < maxrss) 1172 psig->cmaxrss = maxrss; 1173 task_io_accounting_add(&psig->ioac, &p->ioac); 1174 task_io_accounting_add(&psig->ioac, &sig->ioac); 1175 write_sequnlock(&psig->stats_lock); 1176 spin_unlock_irq(¤t->sighand->siglock); 1177 } 1178 1179 if (wo->wo_rusage) 1180 getrusage(p, RUSAGE_BOTH, wo->wo_rusage); 1181 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1182 ? 
p->signal->group_exit_code : p->exit_code; 1183 wo->wo_stat = status; 1184 1185 if (state == EXIT_TRACE) { 1186 write_lock_irq(&tasklist_lock); 1187 /* We dropped tasklist, ptracer could die and untrace */ 1188 ptrace_unlink(p); 1189 1190 /* If parent wants a zombie, don't release it now */ 1191 state = EXIT_ZOMBIE; 1192 if (do_notify_parent(p, p->exit_signal)) 1193 state = EXIT_DEAD; 1194 p->exit_state = state; 1195 write_unlock_irq(&tasklist_lock); 1196 } 1197 if (state == EXIT_DEAD) 1198 release_task(p); 1199 1200 out_info: 1201 infop = wo->wo_info; 1202 if (infop) { 1203 if ((status & 0x7f) == 0) { 1204 infop->cause = CLD_EXITED; 1205 infop->status = status >> 8; 1206 } else { 1207 infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; 1208 infop->status = status & 0x7f; 1209 } 1210 infop->pid = pid; 1211 infop->uid = uid; 1212 } 1213 1214 return pid; 1215 } 1216 1217 static int *task_stopped_code(struct task_struct *p, bool ptrace) 1218 { 1219 if (ptrace) { 1220 if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) 1221 return &p->exit_code; 1222 } else { 1223 if (p->signal->flags & SIGNAL_STOP_STOPPED) 1224 return &p->signal->group_exit_code; 1225 } 1226 return NULL; 1227 } 1228 1229 /** 1230 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED 1231 * @wo: wait options 1232 * @ptrace: is the wait for ptrace 1233 * @p: task to wait for 1234 * 1235 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. 1236 * 1237 * CONTEXT: 1238 * read_lock(&tasklist_lock), which is released if return value is 1239 * non-zero. Also, grabs and releases @p->sighand->siglock. 1240 * 1241 * RETURNS: 1242 * 0 if wait condition didn't exist and search for other wait conditions 1243 * should continue. Non-zero return, -errno on failure and @p's pid on 1244 * success, implies that tasklist_lock is released and wait condition 1245 * search should terminate. 
1246 */ 1247 static int wait_task_stopped(struct wait_opts *wo, 1248 int ptrace, struct task_struct *p) 1249 { 1250 struct waitid_info *infop; 1251 int exit_code, *p_code, why; 1252 uid_t uid = 0; /* unneeded, required by compiler */ 1253 pid_t pid; 1254 1255 /* 1256 * Traditionally we see ptrace'd stopped tasks regardless of options. 1257 */ 1258 if (!ptrace && !(wo->wo_flags & WUNTRACED)) 1259 return 0; 1260 1261 if (!task_stopped_code(p, ptrace)) 1262 return 0; 1263 1264 exit_code = 0; 1265 spin_lock_irq(&p->sighand->siglock); 1266 1267 p_code = task_stopped_code(p, ptrace); 1268 if (unlikely(!p_code)) 1269 goto unlock_sig; 1270 1271 exit_code = *p_code; 1272 if (!exit_code) 1273 goto unlock_sig; 1274 1275 if (!unlikely(wo->wo_flags & WNOWAIT)) 1276 *p_code = 0; 1277 1278 uid = from_kuid_munged(current_user_ns(), task_uid(p)); 1279 unlock_sig: 1280 spin_unlock_irq(&p->sighand->siglock); 1281 if (!exit_code) 1282 return 0; 1283 1284 /* 1285 * Now we are pretty sure this task is interesting. 1286 * Make sure it doesn't get reaped out from under us while we 1287 * give up the lock and then examine it below. We don't want to 1288 * keep holding onto the tasklist_lock while we call getrusage and 1289 * possibly take page faults for user memory. 1290 */ 1291 get_task_struct(p); 1292 pid = task_pid_vnr(p); 1293 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1294 read_unlock(&tasklist_lock); 1295 sched_annotate_sleep(); 1296 if (wo->wo_rusage) 1297 getrusage(p, RUSAGE_BOTH, wo->wo_rusage); 1298 put_task_struct(p); 1299 1300 if (likely(!(wo->wo_flags & WNOWAIT))) 1301 wo->wo_stat = (exit_code << 8) | 0x7f; 1302 1303 infop = wo->wo_info; 1304 if (infop) { 1305 infop->cause = why; 1306 infop->status = exit_code; 1307 infop->pid = pid; 1308 infop->uid = uid; 1309 } 1310 return pid; 1311 } 1312 1313 /* 1314 * Handle do_wait work for one task in a live, non-stopped state. 1315 * read_lock(&tasklist_lock) on entry. 
If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 *
 * Only reports anything when WCONTINUED was requested and @p's signal
 * struct has SIGNAL_STOP_CONTINUED set.  On success the return value is
 * @p's pid; the result goes to wo->wo_info if it is set, otherwise
 * wo->wo_stat is set to 0xffff.
 */
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
	struct waitid_info *infop;
	pid_t pid;
	uid_t uid;

	if (!unlikely(wo->wo_flags & WCONTINUED))
		return 0;

	/* Unlocked fast-path test; repeated below under siglock. */
	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
		return 0;

	spin_lock_irq(&p->sighand->siglock);
	/* Re-check with the lock held.  */
	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
		spin_unlock_irq(&p->sighand->siglock);
		return 0;
	}
	/* Consume the "continued" event unless the caller only peeks. */
	if (!unlikely(wo->wo_flags & WNOWAIT))
		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
	uid = from_kuid_munged(current_user_ns(), task_uid(p));
	spin_unlock_irq(&p->sighand->siglock);

	pid = task_pid_vnr(p);
	/* Hold a reference across the unlock while rusage is read. */
	get_task_struct(p);
	read_unlock(&tasklist_lock);
	sched_annotate_sleep();
	if (wo->wo_rusage)
		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
	put_task_struct(p);

	infop = wo->wo_info;
	if (!infop) {
		wo->wo_stat = 0xffff;
	} else {
		infop->cause = CLD_CONTINUED;
		infop->pid = pid;
		infop->uid = uid;
		infop->status = SIGCONT;
	}
	return pid;
}

/*
 * Consider @p for a wait by @parent.
 *
 * @wo:     wait options and result slots
 * @ptrace: nonzero when @p is being considered as a tracee rather than as
 *          a natural child; may be switched to 1 internally (see below)
 * @p:      candidate task
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue;
 * then ->notask_error is 0 if @p is an eligible child,
 * or still -ECHILD.
 */
static int wait_consider_task(struct wait_opts *wo, int ptrace,
				struct task_struct *p)
{
	/*
	 * We can race with wait_task_zombie() from another thread.
	 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
	 * can't confuse the checks below.
	 */
	int exit_state = READ_ONCE(p->exit_state);
	int ret;

	/* Already being released by someone else: nothing to see. */
	if (unlikely(exit_state == EXIT_DEAD))
		return 0;

	ret = eligible_child(wo, ptrace, p);
	if (!ret)
		return ret;

	if (unlikely(exit_state == EXIT_TRACE)) {
		/*
		 * ptrace == 0 means we are the natural parent. In this case
		 * we should clear notask_error, debugger will notify us.
		 */
		if (likely(!ptrace))
			wo->notask_error = 0;
		return 0;
	}

	if (likely(!ptrace) && unlikely(p->ptrace)) {
		/*
		 * If it is traced by its real parent's group, just pretend
		 * the caller is ptrace_do_wait() and reap this child if it
		 * is zombie.
		 *
		 * This also hides group stop state from real parent; otherwise
		 * a single stop can be reported twice as group and ptrace stop.
		 * If a ptracer wants to distinguish these two events for its
		 * own children it should create a separate process which takes
		 * the role of real parent.
		 */
		if (!ptrace_reparented(p))
			ptrace = 1;
	}

	/* slay zombie? */
	if (exit_state == EXIT_ZOMBIE) {
		/* we don't reap group leaders with subthreads */
		if (!delay_group_leader(p)) {
			/*
			 * A zombie ptracee is only visible to its ptracer.
			 * Notification and reaping will be cascaded to the
			 * real parent when the ptracer detaches.
			 */
			if (unlikely(ptrace) || likely(!p->ptrace))
				return wait_task_zombie(wo, p);
		}

		/*
		 * Allow access to stopped/continued state via zombie by
		 * falling through.  Clearing of notask_error is complex.
		 *
		 * When !@ptrace:
		 *
		 * If WEXITED is set, notask_error should naturally be
		 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
		 * so, if there are live subthreads, there are events to
		 * wait for.  If all subthreads are dead, it's still safe
		 * to clear - this function will be called again in finite
		 * amount of time once all the subthreads are released and
		 * will then return without clearing.
		 *
		 * When @ptrace:
		 *
		 * Stopped state is per-task and thus can't change once the
		 * target task dies.  Only continued and exited can happen.
		 * Clear notask_error if WCONTINUED | WEXITED.
		 */
		if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
			wo->notask_error = 0;
	} else {
		/*
		 * @p is alive and it's gonna stop, continue or exit, so
		 * there always is something to wait for.
		 */
		wo->notask_error = 0;
	}

	/*
	 * Wait for stopped.  Depending on @ptrace, different stopped state
	 * is used and the two don't interact with each other.
	 */
	ret = wait_task_stopped(wo, ptrace, p);
	if (ret)
		return ret;

	/*
	 * Wait for continued.  There's only one continued state and the
	 * ptracer can consume it which can confuse the real parent.  Don't
	 * use WCONTINUED from ptracer.  You don't need or want it.
	 */
	return wait_task_continued(wo, p);
}

/*
 * Do the work of do_wait() for one thread in the group, @tsk.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue; then
 * ->notask_error is 0 if there were any eligible children,
 * or still -ECHILD.
1482 */ 1483 static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) 1484 { 1485 struct task_struct *p; 1486 1487 list_for_each_entry(p, &tsk->children, sibling) { 1488 int ret = wait_consider_task(wo, 0, p); 1489 1490 if (ret) 1491 return ret; 1492 } 1493 1494 return 0; 1495 } 1496 1497 static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) 1498 { 1499 struct task_struct *p; 1500 1501 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1502 int ret = wait_consider_task(wo, 1, p); 1503 1504 if (ret) 1505 return ret; 1506 } 1507 1508 return 0; 1509 } 1510 1511 static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, 1512 int sync, void *key) 1513 { 1514 struct wait_opts *wo = container_of(wait, struct wait_opts, 1515 child_wait); 1516 struct task_struct *p = key; 1517 1518 if (!eligible_pid(wo, p)) 1519 return 0; 1520 1521 if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) 1522 return 0; 1523 1524 return default_wake_function(wait, mode, sync, key); 1525 } 1526 1527 void __wake_up_parent(struct task_struct *p, struct task_struct *parent) 1528 { 1529 __wake_up_sync_key(&parent->signal->wait_chldexit, 1530 TASK_INTERRUPTIBLE, p); 1531 } 1532 1533 static bool is_effectively_child(struct wait_opts *wo, bool ptrace, 1534 struct task_struct *target) 1535 { 1536 struct task_struct *parent = 1537 !ptrace ? target->real_parent : target->parent; 1538 1539 return current == parent || (!(wo->wo_flags & __WNOTHREAD) && 1540 same_thread_group(current, parent)); 1541 } 1542 1543 /* 1544 * Optimization for waiting on PIDTYPE_PID. No need to iterate through child 1545 * and tracee lists to find the target task. 
1546 */ 1547 static int do_wait_pid(struct wait_opts *wo) 1548 { 1549 bool ptrace; 1550 struct task_struct *target; 1551 int retval; 1552 1553 ptrace = false; 1554 target = pid_task(wo->wo_pid, PIDTYPE_TGID); 1555 if (target && is_effectively_child(wo, ptrace, target)) { 1556 retval = wait_consider_task(wo, ptrace, target); 1557 if (retval) 1558 return retval; 1559 } 1560 1561 ptrace = true; 1562 target = pid_task(wo->wo_pid, PIDTYPE_PID); 1563 if (target && target->ptrace && 1564 is_effectively_child(wo, ptrace, target)) { 1565 retval = wait_consider_task(wo, ptrace, target); 1566 if (retval) 1567 return retval; 1568 } 1569 1570 return 0; 1571 } 1572 1573 static long do_wait(struct wait_opts *wo) 1574 { 1575 int retval; 1576 1577 trace_sched_process_wait(wo->wo_pid); 1578 1579 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); 1580 wo->child_wait.private = current; 1581 add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); 1582 repeat: 1583 /* 1584 * If there is nothing that can match our criteria, just get out. 1585 * We will clear ->notask_error to zero if we see any child that 1586 * might later match our criteria, even if we are not able to reap 1587 * it yet. 
1588 */ 1589 wo->notask_error = -ECHILD; 1590 if ((wo->wo_type < PIDTYPE_MAX) && 1591 (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) 1592 goto notask; 1593 1594 set_current_state(TASK_INTERRUPTIBLE); 1595 read_lock(&tasklist_lock); 1596 1597 if (wo->wo_type == PIDTYPE_PID) { 1598 retval = do_wait_pid(wo); 1599 if (retval) 1600 goto end; 1601 } else { 1602 struct task_struct *tsk = current; 1603 1604 do { 1605 retval = do_wait_thread(wo, tsk); 1606 if (retval) 1607 goto end; 1608 1609 retval = ptrace_do_wait(wo, tsk); 1610 if (retval) 1611 goto end; 1612 1613 if (wo->wo_flags & __WNOTHREAD) 1614 break; 1615 } while_each_thread(current, tsk); 1616 } 1617 read_unlock(&tasklist_lock); 1618 1619 notask: 1620 retval = wo->notask_error; 1621 if (!retval && !(wo->wo_flags & WNOHANG)) { 1622 retval = -ERESTARTSYS; 1623 if (!signal_pending(current)) { 1624 schedule(); 1625 goto repeat; 1626 } 1627 } 1628 end: 1629 __set_current_state(TASK_RUNNING); 1630 remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); 1631 return retval; 1632 } 1633 1634 static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, 1635 int options, struct rusage *ru) 1636 { 1637 struct wait_opts wo; 1638 struct pid *pid = NULL; 1639 enum pid_type type; 1640 long ret; 1641 unsigned int f_flags = 0; 1642 1643 if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| 1644 __WNOTHREAD|__WCLONE|__WALL)) 1645 return -EINVAL; 1646 if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) 1647 return -EINVAL; 1648 1649 switch (which) { 1650 case P_ALL: 1651 type = PIDTYPE_MAX; 1652 break; 1653 case P_PID: 1654 type = PIDTYPE_PID; 1655 if (upid <= 0) 1656 return -EINVAL; 1657 1658 pid = find_get_pid(upid); 1659 break; 1660 case P_PGID: 1661 type = PIDTYPE_PGID; 1662 if (upid < 0) 1663 return -EINVAL; 1664 1665 if (upid) 1666 pid = find_get_pid(upid); 1667 else 1668 pid = get_task_pid(current, PIDTYPE_PGID); 1669 break; 1670 case P_PIDFD: 1671 type = PIDTYPE_PID; 1672 if (upid < 0) 1673 
return -EINVAL; 1674 1675 pid = pidfd_get_pid(upid, &f_flags); 1676 if (IS_ERR(pid)) 1677 return PTR_ERR(pid); 1678 1679 break; 1680 default: 1681 return -EINVAL; 1682 } 1683 1684 wo.wo_type = type; 1685 wo.wo_pid = pid; 1686 wo.wo_flags = options; 1687 wo.wo_info = infop; 1688 wo.wo_rusage = ru; 1689 if (f_flags & O_NONBLOCK) 1690 wo.wo_flags |= WNOHANG; 1691 1692 ret = do_wait(&wo); 1693 if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK)) 1694 ret = -EAGAIN; 1695 1696 put_pid(pid); 1697 return ret; 1698 } 1699 1700 SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, 1701 infop, int, options, struct rusage __user *, ru) 1702 { 1703 struct rusage r; 1704 struct waitid_info info = {.status = 0}; 1705 long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); 1706 int signo = 0; 1707 1708 if (err > 0) { 1709 signo = SIGCHLD; 1710 err = 0; 1711 if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) 1712 return -EFAULT; 1713 } 1714 if (!infop) 1715 return err; 1716 1717 if (!user_write_access_begin(infop, sizeof(*infop))) 1718 return -EFAULT; 1719 1720 unsafe_put_user(signo, &infop->si_signo, Efault); 1721 unsafe_put_user(0, &infop->si_errno, Efault); 1722 unsafe_put_user(info.cause, &infop->si_code, Efault); 1723 unsafe_put_user(info.pid, &infop->si_pid, Efault); 1724 unsafe_put_user(info.uid, &infop->si_uid, Efault); 1725 unsafe_put_user(info.status, &infop->si_status, Efault); 1726 user_write_access_end(); 1727 return err; 1728 Efault: 1729 user_write_access_end(); 1730 return -EFAULT; 1731 } 1732 1733 long kernel_wait4(pid_t upid, int __user *stat_addr, int options, 1734 struct rusage *ru) 1735 { 1736 struct wait_opts wo; 1737 struct pid *pid = NULL; 1738 enum pid_type type; 1739 long ret; 1740 1741 if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| 1742 __WNOTHREAD|__WCLONE|__WALL)) 1743 return -EINVAL; 1744 1745 /* -INT_MIN is not defined */ 1746 if (upid == INT_MIN) 1747 return -ESRCH; 1748 1749 if (upid == -1) 1750 type = 
PIDTYPE_MAX; 1751 else if (upid < 0) { 1752 type = PIDTYPE_PGID; 1753 pid = find_get_pid(-upid); 1754 } else if (upid == 0) { 1755 type = PIDTYPE_PGID; 1756 pid = get_task_pid(current, PIDTYPE_PGID); 1757 } else /* upid > 0 */ { 1758 type = PIDTYPE_PID; 1759 pid = find_get_pid(upid); 1760 } 1761 1762 wo.wo_type = type; 1763 wo.wo_pid = pid; 1764 wo.wo_flags = options | WEXITED; 1765 wo.wo_info = NULL; 1766 wo.wo_stat = 0; 1767 wo.wo_rusage = ru; 1768 ret = do_wait(&wo); 1769 put_pid(pid); 1770 if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) 1771 ret = -EFAULT; 1772 1773 return ret; 1774 } 1775 1776 int kernel_wait(pid_t pid, int *stat) 1777 { 1778 struct wait_opts wo = { 1779 .wo_type = PIDTYPE_PID, 1780 .wo_pid = find_get_pid(pid), 1781 .wo_flags = WEXITED, 1782 }; 1783 int ret; 1784 1785 ret = do_wait(&wo); 1786 if (ret > 0 && wo.wo_stat) 1787 *stat = wo.wo_stat; 1788 put_pid(wo.wo_pid); 1789 return ret; 1790 } 1791 1792 SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, 1793 int, options, struct rusage __user *, ru) 1794 { 1795 struct rusage r; 1796 long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); 1797 1798 if (err > 0) { 1799 if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) 1800 return -EFAULT; 1801 } 1802 return err; 1803 } 1804 1805 #ifdef __ARCH_WANT_SYS_WAITPID 1806 1807 /* 1808 * sys_waitpid() remains for compatibility. waitpid() should be 1809 * implemented by calling sys_wait4() from libc.a. 1810 */ 1811 SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) 1812 { 1813 return kernel_wait4(pid, stat_addr, options, NULL); 1814 } 1815 1816 #endif 1817 1818 #ifdef CONFIG_COMPAT 1819 COMPAT_SYSCALL_DEFINE4(wait4, 1820 compat_pid_t, pid, 1821 compat_uint_t __user *, stat_addr, 1822 int, options, 1823 struct compat_rusage __user *, ru) 1824 { 1825 struct rusage r; 1826 long err = kernel_wait4(pid, stat_addr, options, ru ? 
&r : NULL); 1827 if (err > 0) { 1828 if (ru && put_compat_rusage(&r, ru)) 1829 return -EFAULT; 1830 } 1831 return err; 1832 } 1833 1834 COMPAT_SYSCALL_DEFINE5(waitid, 1835 int, which, compat_pid_t, pid, 1836 struct compat_siginfo __user *, infop, int, options, 1837 struct compat_rusage __user *, uru) 1838 { 1839 struct rusage ru; 1840 struct waitid_info info = {.status = 0}; 1841 long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); 1842 int signo = 0; 1843 if (err > 0) { 1844 signo = SIGCHLD; 1845 err = 0; 1846 if (uru) { 1847 /* kernel_waitid() overwrites everything in ru */ 1848 if (COMPAT_USE_64BIT_TIME) 1849 err = copy_to_user(uru, &ru, sizeof(ru)); 1850 else 1851 err = put_compat_rusage(&ru, uru); 1852 if (err) 1853 return -EFAULT; 1854 } 1855 } 1856 1857 if (!infop) 1858 return err; 1859 1860 if (!user_write_access_begin(infop, sizeof(*infop))) 1861 return -EFAULT; 1862 1863 unsafe_put_user(signo, &infop->si_signo, Efault); 1864 unsafe_put_user(0, &infop->si_errno, Efault); 1865 unsafe_put_user(info.cause, &infop->si_code, Efault); 1866 unsafe_put_user(info.pid, &infop->si_pid, Efault); 1867 unsafe_put_user(info.uid, &infop->si_uid, Efault); 1868 unsafe_put_user(info.status, &infop->si_status, Efault); 1869 user_write_access_end(); 1870 return err; 1871 Efault: 1872 user_write_access_end(); 1873 return -EFAULT; 1874 } 1875 #endif 1876 1877 /** 1878 * thread_group_exited - check that a thread group has exited 1879 * @pid: tgid of thread group to be checked. 1880 * 1881 * Test if the thread group represented by tgid has exited (all 1882 * threads are zombies, dead or completely gone). 1883 * 1884 * Return: true if the thread group has exited. false otherwise. 
1885 */ 1886 bool thread_group_exited(struct pid *pid) 1887 { 1888 struct task_struct *task; 1889 bool exited; 1890 1891 rcu_read_lock(); 1892 task = pid_task(pid, PIDTYPE_PID); 1893 exited = !task || 1894 (READ_ONCE(task->exit_state) && thread_group_empty(task)); 1895 rcu_read_unlock(); 1896 1897 return exited; 1898 } 1899 EXPORT_SYMBOL(thread_group_exited); 1900 1901 __weak void abort(void) 1902 { 1903 BUG(); 1904 1905 /* if that doesn't kill us, halt */ 1906 panic("Oops failed to kill thread"); 1907 } 1908 EXPORT_SYMBOL(abort); 1909